In [6]:
import pyspark

In [33]:
from pyspark.sql import SparkSession, functions as F

In [11]:
# Reuse the currently active SparkSession if there is one; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

22/10/18 00:30:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


![Mapper_Transformation](img/mapper_transformations.png)

- lazy evaluation: a transformation is not evaluated until an action is executed
- this helps optimization: Spark can inspect the entire DAG before running anything

## 1. map() transformation

In [8]:
# Print the built-in docstring of RDD.map for reference.
help(pyspark.RDD.map)

Help on function map in module pyspark.rdd:

map(self, f, preservesPartitioning=False)
    Return a new RDD by applying a function to each element of this RDD.
    
    >>> rdd = sc.parallelize(["b", "a", "c"])
    >>> sorted(rdd.map(lambda x: (x, 1)).collect())
    [('a', 1), ('b', 1), ('c', 1)]



### 1.1 RDD mapper

In [9]:
def relu_func(x):
    """ReLU: map negative numbers to 0 and return every other value unchanged."""
    return 0 if x < 0 else x

In [17]:
# Small sample that includes negative values, distributed as an RDD.
data = [1, -1, -2, 3, 4]
rdd_sample_2 = spark.sparkContext.parallelize(data)
rdd_sample_2.collect()

[1, -1, -2, 3, 4]

In [13]:
# map() applies the given function to every element; here relu_func is wrapped in a lambda.
rdd_relu = rdd_sample_2.map(lambda x : relu_func(x))
rdd_relu.collect()

[1, 0, 0, 3, 4]

In [16]:
# Equivalent form: pass the function object itself instead of wrapping it in a lambda.
rdd_relu = rdd_sample_2.map(relu_func)
rdd_relu.collect()

[1, 0, 0, 3, 4]

- add a new field to the value

In [18]:
# (key, value) tuples, some with negative values, distributed as an RDD.
pairs = [('a', 2), ('b', -1), ('d', -2), ('e', 3)]
rdd_pairs = spark.sparkContext.parallelize(pairs)
rdd_pairs.collect()

[('a', 2), ('b', -1), ('d', -2), ('e', 3)]

In [23]:
# Keep the key and replace the value with the tuple (original value, relu_func(original value)).
rdd_pairs_relu = rdd_pairs.map(lambda pair: (pair[0], (pair[1], relu_func(pair[1]) )))
rdd_pairs_relu.collect()

[('a', (2, 2)), ('b', (-1, 0)), ('d', (-2, 0)), ('e', (3, 3))]

- a more advanced mapper function

In [26]:
def parse_record(record:str):
    """
    Turn one comma-separated line into a ``(name, (age, num_friends))`` pair.

    The field at index 0 is ignored. Index 1 is taken as the name (unchanged),
    while indexes 2 and 3 are converted to ``int`` as age and number of friends.
    """
    fields = record.split(",")
    return (fields[1], (int(fields[2]), int(fields[3])))

In [27]:
# textFile() produces one string element per line of the file; parse_record then turns
# each line into a (name, (age, num_friends)) pair.
# NOTE(review): assumes every line of data/users.txt has at least four comma-separated fields -- confirm.
rdd_user = spark.sparkContext.textFile("data/users.txt")
rdd_pair = rdd_user.map(parse_record)
rdd_pair.collect()

[('Alex', (30, 124)),
 ('Bert', (32, 234)),
 ('Curt', (28, 312)),
 ('Don', (32, 180)),
 ('Mary', (30, 100)),
 ('Jane', (28, 212)),
 ('Joe', (28, 128)),
 ('Al', (40, 600))]

### 1.2 DataFrame Mapper

- DataFrame equivalent of rdd.map: use DataFrame.withColumn() to add the derived column, then DataFrame.drop() to remove any columns that are no longer needed

In [28]:
# (name, amount, education) records.
# NOTE: despite the "rdd_" prefix this is a plain Python list, not an RDD; the name is
# kept because the next cell refers to it.
rdd_user_degree = [ ('alex', 440, 'PHD'), ('jane', 420, 'PHD'),
                    ('bob', 280, 'MS'), ('betty', 200, 'MS'),
                    ('ted', 180, 'BS'), ('mary', 100, 'BS') ]

In [29]:
# Build a DataFrame from the local list. Only column names are supplied, so the
# column types are inferred from the data.
df_user_degree = spark.createDataFrame(rdd_user_degree,
                                       schema=["name", "amount", "education"])
df_user_degree.show()

+-----+------+---------+
| name|amount|education|
+-----+------+---------+
| alex|   440|      PHD|
| jane|   420|      PHD|
|  bob|   280|       MS|
|betty|   200|       MS|
|  ted|   180|       BS|
| mary|   100|       BS|
+-----+------+---------+



#### 1.2.1 RDD approaches

In [30]:
# Add a "bonus" column (10% of amount) by dropping down to the RDD API:
# map every Row to a 4-tuple, then convert the tuples back into a DataFrame.
bonus_columns = ["name", "amount", "education", "bonus"]
rows_with_bonus = df_user_degree.rdd.map(
    lambda row: (row["name"], row["amount"], row["education"], int(row["amount"]) / 10)
)
df_bonus = rows_with_bonus.toDF(bonus_columns)
df_bonus.show()

+-----+------+---------+-----+
| name|amount|education|bonus|
+-----+------+---------+-----+
| alex|   440|      PHD| 44.0|
| jane|   420|      PHD| 42.0|
|  bob|   280|       MS| 28.0|
|betty|   200|       MS| 20.0|
|  ted|   180|       BS| 18.0|
| mary|   100|       BS| 10.0|
+-----+------+---------+-----+



In [32]:
# Shorter RDD variant: a Row is a tuple, so the bonus can simply be appended to it,
# and the column names can be derived from the source DataFrame.
# The bonus is kept numeric (no str() around it) so the resulting column is a double,
# not a string that merely looks like a number when shown.
df_bonus = (df_user_degree.rdd
            .map(lambda row: tuple(row) + (row["amount"] / 10,))
            .toDF(df_user_degree.columns + ["bonus"])
           )
df_bonus.show()

+-----+------+---------+-----+
| name|amount|education|bonus|
+-----+------+---------+-----+
| alex|   440|      PHD| 44.0|
| jane|   420|      PHD| 42.0|
|  bob|   280|       MS| 28.0|
|betty|   200|       MS| 20.0|
|  ted|   180|       BS| 18.0|
| mary|   100|       BS| 10.0|
+-----+------+---------+-----+



#### 1.2.2 withColumn approach

In [35]:
# Arithmetic on a Column already yields a Column expression, so no F.lit() wrapper is
# needed (F.lit is for turning Python literals into Columns).
df_bonus = df_user_degree.withColumn("bonus", F.col("amount") / 10)
df_bonus.show()

+-----+------+---------+-----+
| name|amount|education|bonus|
+-----+------+---------+-----+
| alex|   440|      PHD| 44.0|
| jane|   420|      PHD| 42.0|
|  bob|   280|       MS| 28.0|
|betty|   200|       MS| 20.0|
|  ted|   180|       BS| 18.0|
| mary|   100|       BS| 10.0|
+-----+------+---------+-----+



#### 1.2.3 Mapper over multiple DataFrame columns

In [36]:
def compute_bonus(amount, education):
    '''
    Calculate the bonus as a share of ``amount`` that depends on education.

    Rates: "PHD" -> 30%, "MS" -> 20%, anything else (e.g. "BS") -> 10%.
    The education label is compared case-sensitively.

    Parameters
    ----------
    amount : int, float or None
        Base amount. ``None`` (what a Spark UDF receives for a null cell) is
        passed through as ``None`` instead of raising ``TypeError``.
    education : str
        Education label.

    Returns
    -------
    int or None
        The bonus truncated toward zero, or ``None`` when ``amount`` is ``None``.
    '''
    # Null-safe: a null amount yields a null bonus rather than failing the whole job.
    if amount is None:
        return None
    if education == "PHD":
        rate = 0.30
    elif education == "MS":
        rate = 0.20
    else:
        rate = 0.10
    return int(amount * rate)

In [37]:
import pyspark.sql.types as T

In [38]:
# Register compute_bonus as a Spark UDF that returns an integer column. The function is
# passed directly; wrapping it in a lambda with the same arguments adds nothing.
# NOTE: the variable name (including its "coumpute" typo) is kept because a later cell uses it.
udf_coumpute_bonus = F.udf(compute_bonus, T.IntegerType())

In [40]:
# Apply the UDF: it receives the values of two columns and produces the new "bonus" column.
bonus_column = udf_coumpute_bonus(F.col("amount"), F.col("education"))
df_bonus = df_user_degree.withColumn("bonus", bonus_column)
df_bonus.show()

+-----+------+---------+-----+
| name|amount|education|bonus|
+-----+------+---------+-----+
| alex|   440|      PHD|  132|
| jane|   420|      PHD|  126|
|  bob|   280|       MS|   56|
|betty|   200|       MS|   40|
|  ted|   180|       BS|   18|
| mary|   100|       BS|   10|
+-----+------+---------+-----+



## 2. flatMap()

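- make sure that the function passed to flatMap() returns an iterable (list, tuple, range, generator, ...) for every element of the source RDD; the elements themselves do not have to be iterable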

### 2.1 Basic

In [42]:
# Print the built-in docstring of RDD.flatMap for reference.
help(pyspark.RDD.flatMap)

Help on function flatMap in module pyspark.rdd:

flatMap(self, f, preservesPartitioning=False)
    Return a new RDD by first applying a function to all elements of this
    RDD, and then flattening the results.
    
    >>> rdd = sc.parallelize([2, 3, 4])
    >>> sorted(rdd.flatMap(lambda x: range(1, x)).collect())
    [1, 1, 1, 2, 2, 3]
    >>> sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())
    [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]



In [43]:
# Five words of different lengths; count() shows the size before flattening.
words = ["a", "red", "of", "fox", "jumped"]
rdd_words = spark.sparkContext.parallelize(words)
rdd_words.count()

5

In [50]:
def my_flatmap_func(x):
    """
    Return a two-element list holding the same re-cased copy of ``x`` twice.

    Strings shorter than 3 characters are lower-cased; all others are upper-cased.
    """
    recased = x.lower() if len(x) < 3 else x.upper()
    return [recased, recased]

In [51]:
# my_flatmap_func returns a two-element list per word; flatMap() flattens those lists
# into a single RDD, so the element count doubles.
rdd_flattened = rdd_words.flatMap(my_flatmap_func)
rdd_flattened.count()

10

In [52]:
# Bring the flattened elements back to the driver to inspect them.
rdd_flattened.collect()

['a', 'a', 'RED', 'RED', 'of', 'of', 'FOX', 'FOX', 'JUMPED', 'JUMPED']

### 2.2 flatMap() with text processing

In [53]:
import string, re

In [54]:
def no_punctuation(record_str):
    """
    Remove ASCII punctuation from a string and collapse whitespace runs.

    Every character listed in ``string.punctuation`` is deleted, then each run
    of whitespace is replaced by a single space. Leading and trailing
    whitespace is collapsed too but not removed, so the result may still start
    or end with one space.

    Parameters
    ----------
    record_str : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # str.translate with a deletion table removes all punctuation in a single pass.
    without_punct = record_str.translate(str.maketrans('', '', string.punctuation))
    # Raw string: '\s' inside a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    trimmed = re.sub(r'\s+', ' ', without_punct)
    return trimmed

In [55]:
# Two sentences containing punctuation, distributed as an RDD of strings.
sentences = ["Fox, ran 2 fast!!!", "Fox, jumped; of fence!!!"]
rdd_sen = spark.sparkContext.parallelize(sentences)
rdd_sen.collect()

['Fox, ran 2 fast!!!', 'Fox, jumped; of fence!!!']

In [57]:
# Pipeline: strip punctuation per sentence -> split each sentence into words (flattened
# into one RDD of words) -> keep only words longer than two characters.
rdd_cleaned = rdd_sen.map(no_punctuation)
flattened = rdd_cleaned.flatMap(lambda v: v.split(" "))
final_rdd = flattened.filter(lambda w: len(w) > 2)
final_rdd.collect()

['Fox', 'ran', 'fast', 'Fox', 'jumped', 'fence']

### 2.3 Apply flatMap() to a DataFrame - F.explode(col)

In [64]:
# (name, languages, education) records where the last two fields are lists.
# 'max' and 'dan' have an empty education list.
programmer = [('alex', ['Java','Scala', 'Python'], ['MS', 'PHD']),
              ('jane', ['Cobol','Snobol'], ['BS', 'MS']),
              ('bob', ['C++'], ['BS', 'MS', 'PHD']),
              ('ted', ['Julia'], ['BS', 'MS']),
              ('max', ['FORTRAN'], []),
              ('dan', ['R'], [])]

In [65]:
# Build the DataFrame from the local records; truncate=False prints the full array values.
df_programmer = spark.createDataFrame(data=programmer,
                                      schema = ['name', 'languages', 'education'])
df_programmer.show(truncate=False)

+----+---------------------+-------------+
|name|languages            |education    |
+----+---------------------+-------------+
|alex|[Java, Scala, Python]|[MS, PHD]    |
|jane|[Cobol, Snobol]      |[BS, MS]     |
|bob |[C++]                |[BS, MS, PHD]|
|ted |[Julia]              |[BS, MS]     |
|max |[FORTRAN]            |[]           |
|dan |[R]                  |[]           |
+----+---------------------+-------------+



In [74]:
# Explode both array columns, one per select(): first one row per language, then one
# row per (language, education) combination. Rows whose array is empty produce no output
# rows, which mirrors flatMap() over an empty iterable.
df_programmer_exploded = (
    df_programmer
    .select("name", F.explode("languages").alias("language"), "education")
    .select("name", "language", F.explode("education").alias("education"))
)
df_programmer_exploded.show(truncate=False)

+----+--------+---------+
|name|language|education|
+----+--------+---------+
|alex|Java    |MS       |
|alex|Java    |PHD      |
|alex|Scala   |MS       |
|alex|Scala   |PHD      |
|alex|Python  |MS       |
|alex|Python  |PHD      |
|jane|Cobol   |BS       |
|jane|Cobol   |MS       |
|jane|Snobol  |BS       |
|jane|Snobol  |MS       |
|bob |C++     |BS       |
|bob |C++     |MS       |
|bob |C++     |PHD      |
|ted |Julia   |BS       |
|ted |Julia   |MS       |
+----+--------+---------+

