# Chapter 02
## 初始化设置

In [3]:
import re

import numpy as np
from pyspark import SparkConf, SparkContext

In [4]:
# Spark configuration for this notebook; only the application name is set.
conf = SparkConf().setAppName('chapter2')
# getOrCreate() returns the already-running context when there is one, so
# re-running this cell does not fail with "Cannot run multiple SparkContexts
# at once". Note that `conf` is only applied when a new context is created.
sc = SparkContext.getOrCreate(conf=conf)
sc

Spark UI: localhost:4040

## 导入数据

### 从数组导入数据

In [5]:
# Distribute a small in-memory list of (name, number) pairs as an RDD.
name_number_pairs = [
    ('Amber', 22),
    ('Alfred', 23),
    ('Skye', 4),
    ('Albert', 12),
    ('Amber', 9),
]
data = sc.parallelize(name_number_pairs)
data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

### 从文件导入数据

- 可以通过本地
- 也可通过 HDFS,AWS 等

In [6]:
# Create an RDD backed by the text file "VS14MORT.txt" and display its description.
data_fromfile = sc.textFile("VS14MORT.txt")
data_fromfile

VS14MORT.txt MapPartitionsRDD[2] at textFile at NativeMethodAccessorImpl.java:0

## Collect Data

In [7]:
# Bring every element of the RDD back to the driver as a Python list.
data.collect()

[('Amber', 22), ('Alfred', 23), ('Skye', 4), ('Albert', 12), ('Amber', 9)]

### Reading from files

In [8]:
# Fetch only the first record of the file-backed RDD (a single raw text line).
data_fromfile.take(1)

['                   1                                          2101  M1087 432311  4M4                2014U7CN                                    I64 238 070   24 0111I64                                                                                                                                                                           01 I64                                                                                                  01  11                                 100 601']

### Example of .map()

In [49]:
# Positions to keep from the list produced by ``_RECORD_PATTERN.split(row)``.
# Position 0 is the text before the match and positions 1-89 are the capture
# groups. The positions left out are the whitespace-only filler groups
# (1, 3, 8, 20, 26, 31, 35, 57, 59, 80, 86), group 88, and the trailing
# remainder after the match.
# This must stay a list: NumPy would treat a tuple as a multi-dimensional index.
_SELECTED_INDICES = [
    2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
    19, 21, 22, 23, 24, 25, 27, 28, 29, 30, 32, 33, 34,
    36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
    49, 50, 51, 52, 53, 54, 55, 56, 58, 60, 61, 62, 63,
    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
    77, 78, 79, 81, 82, 83, 84, 85, 87, 89,
]

# Fixed-width layout of one record: every column range is its own capture
# group. Built once here rather than on every call, because the function is
# applied to each record of the dataset.
_RECORD_PATTERN = re.compile(
    r'([\s]{19})([0-9]{1})([\s]{40})([0-9\s]{2})([0-9\s]{1})([0-9]{1})([0-9]{2})'
    r'([\s]{2})([FM]{1})([0-9]{1})([0-9]{3})([0-9\s]{1})([0-9]{2})([0-9]{2})'
    r'([0-9]{2})([0-9\s]{2})([0-9]{1})([SMWDU]{1})([0-9]{1})([\s]{16})([0-9]{4})'
    r'([YNU]{1})([0-9\s]{1})([BCOU]{1})([YNU]{1})([\s]{34})([0-9\s]{1})([0-9\s]{1})'
    r'([A-Z0-9\s]{4})([0-9]{3})([\s]{1})([0-9\s]{3})([0-9\s]{3})([0-9\s]{2})([\s]{1})'
    r'([0-9\s]{2})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})'
    r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})'
    r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})'
    r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})'
    r'([A-Z0-9\s]{7})([\s]{36})([A-Z0-9\s]{2})([\s]{1})([A-Z0-9\s]{5})([A-Z0-9\s]{5})'
    r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})'
    r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})'
    r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})'
    r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([\s]{1})([0-9\s]{2})([0-9\s]{1})'
    r'([0-9\s]{1})([0-9\s]{1})([0-9\s]{1})([\s]{33})([0-9\s]{3})([0-9\s]{1})([0-9\s]{1})'
)


def extractInformation(row):
    """Split one fixed-width record into its selected fields.

    Args:
        row: One line of the fixed-width text file.

    Returns:
        A NumPy array of strings with one entry per index in
        ``_SELECTED_INDICES`` (77 entries). Field values keep their original
        padding. If the row cannot be parsed, every entry is the string
        ``'-99'`` instead.
    """
    try:
        # split() returns [before, group 1, ..., group 89, after] when the
        # pattern matches; fancy indexing then picks the wanted positions.
        return np.array(_RECORD_PATTERN.split(row))[_SELECTED_INDICES]
    except Exception:
        # A row that does not match yields a one-element list, so the indexing
        # above raises IndexError; a non-string row raises TypeError. Either
        # way fall back to the sentinel record so one bad line does not abort
        # the job. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return np.array(['-99'] * len(_SELECTED_INDICES))

In [50]:
# Parse every raw line with extractInformation, giving one array of fields per record.
data_fromfile_conv = data_fromfile.map(extractInformation)
# Display the first parsed record.
data_fromfile_conv.take(1)

[array(['1', '  ', '2', '1', '01', 'M', '1', '087', ' ', '43', '23', '11',
        '  ', '4', 'M', '4', '2014', 'U', '7', 'C', 'N', ' ', ' ', 'I64 ',
        '238', '070', '   ', '24', '01', '11I64  ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '       ',
        '       ', '       ', '       ', '       ', '       ', '01',
        'I64  ', '     ', '     ', '     ', '     ', '     ', '     ',
        '     ', '     ', '     ', '     ', '     ', '     ', '     ',
        '     ', '     ', '     ', '     ', '     ', '     ', '01', ' ',
        ' ', '1', '1', '100', '6'], dtype='<U40')]

## Transformation

### `.map()`

In [53]:
# Convert field 16 of every parsed record to an integer. Records that failed
# to parse carry '-99' in every field, so they come out as -99.
data_2014 = data_fromfile_conv.map(
    lambda record: int(record[16])
)
data_2014.take(10)

[2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, -99]

In [56]:
# Map each record to a pair: field 16 as the original string and as an integer.
data_2014_2 = data_fromfile_conv.map(lambda record: (record[16], int(record[16])))
data_2014_2.take(5)

[('2014', 2014),
 ('2014', 2014),
 ('2014', 2014),
 ('2014', 2014),
 ('2014', 2014)]

### `.filter()`
Another frequently used transformation is the `.filter(...)` method, which allows you to select the elements of your dataset that meet specified criteria.

In [57]:
# Keep only the records whose field 5 equals 'F' and whose field 21 equals '0',
# then count how many remain.
data_filtered = data_fromfile_conv.filter(
    lambda record: record[5] == 'F' and record[21] == '0'
)
data_filtered.count()

6

### `.flatMap()`
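The `.flatMap(...)` method works similarly to `.map(...)`, but it flattens the result: every item of the sequence returned for a row becomes its own element of the new RDD, instead of the whole sequence being kept as a single element.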

In [58]:
# Each record yields a 2-tuple (field 16 as a string, field 16 as an int plus one);
# flatMap unpacks those tuples so both values become separate RDD elements.
data_2014_flat = data_fromfile_conv.flatMap(
    lambda record: (record[16], int(record[16]) + 1)
)
data_2014_flat.take(10)

['2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015]

### `distinct()`
This method returns the distinct elements of an RDD; applied after a `.map(...)` that selects a single column, it gives the distinct values of that column. It is extremely useful if you want to get to know your dataset or validate it.

In [59]:
# Project every record onto field 5, then drop the duplicate values.
distinct_gender = (
    data_fromfile_conv
    .map(lambda record: record[5])
    .distinct()
)
distinct_gender.collect()

['-99', 'M', 'F']

> Note that this is an expensive method and should be used sparingly and only when necessary as it shuffles the data around.

### `sample()`
The `.sample(...)` method returns a randomized sample from the dataset.
- The first parameter specifies whether the sampling should be done with replacement,
- the second parameter defines the fraction of the data to return,
- and the third is the seed for the pseudo-random number generator:

In [62]:
# Sample about 10% of the parsed records without replacement, with a fixed seed.
fraction = 0.1
data_sample = data_fromfile_conv.sample(
    withReplacement=False, fraction=fraction, seed=666)

# Count the full dataset first and the sample second, then report both.
total_records = data_fromfile_conv.count()
sampled_records = data_sample.count()
print('Original dataset: {0}, sample: {1}'.format(total_records, sampled_records))

Original dataset: 2631171, sample: 263161


### `leftOuterJoin()`
.leftOuterJoin(...), just like in the SQL world, joins two RDDs based on the values found in both datasets, and returns records from the left RDD with records from the right one appended in places where the two RDDs match:

In [63]:
# Left RDD: one value for each of the keys 'a', 'b' and 'c'.
rdd1 = sc.parallelize([('a', 1), ('b', 4), ('c',10)])
# Right RDD: key 'a' appears twice, 'c' is absent, and 'd' has no counterpart in rdd1.
rdd2 = sc.parallelize([('a', 4), ('a', 1), ('b', '6'), ('d', 15)])
# Keep every key of rdd1; a key with no match in rdd2 is paired with None.
rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3.collect()

[('a', (1, 4)), ('a', (1, 1)), ('b', (4, '6')), ('c', (10, None))]

> This is another expensive method and should be used sparingly and only when necessary as it shuffles the data around causing a performance hit.

If we used the .join(...) method instead we would have got only the values for 'a' and 'b' as these two values intersect between these two RDDs.

In [64]:
# Inner join on the key: only keys present in both RDDs appear in the result.
rdd4 = rdd1.join(rdd2)

rdd4.collect()

[('a', (1, 4)), ('a', (1, 1)), ('b', (4, '6'))]

Another useful method is .intersection(...), which returns the records that are equal in both RDDs. Execute the following code:

In [65]:
# Intersection compares whole elements (key and value together), not just keys.
rdd5 = rdd1.intersection(rdd2)
rdd5.collect()

[('a', 1)]

### `repartition()`
Repartitioning the dataset changes the number of partitions that the dataset is divided into. This functionality should be used sparingly and only when really necessary as it shuffles the data around, which in effect results in a significant hit in terms of performance:

In [66]:
# Rebind rdd1 to a version of itself spread over 4 partitions.
rdd1 = rdd1.repartition(4)
# glom() turns each partition into one list, so the length of the collected
# result is the number of partitions.
len(rdd1.glom().collect())

4

The .glom() method, in contrast to .collect(), produces a list where each element is another list of all elements of the dataset present in a specified partition; the main list returned has as many elements as the number of partitions.