In [None]:
from pyspark import SparkConf, SparkContext

# Build the application config and obtain the SparkContext for this notebook.
# SparkContext.getOrCreate(conf) returns the already-running context when one
# exists, so re-running this cell on a live kernel no longer goes through the
# SparkContext(...) constructor, which refuses to start a second context.
conf = SparkConf().setAppName('PracticeRDD')
sc = SparkContext.getOrCreate(conf=conf)


In [None]:
# Newline shorthand used as a separator argument in multi-part print() calls.
n = '\n'

# Small practice inputs, relative to the notebook's working directory.
nums_file = 'data/nums.txt'
txt_file = 'data/text.txt'
mango_txt = 'data/mango.txt'

# Sample weather payload (absolute, machine-specific path). Adjacent string
# literals are concatenated at compile time, so the value is one single path.
sample_weather_file = (
    '/Users/juliushernandez/Documents/GitHub/__dc_pipelines__/evergreen/'
    'evergreen_custom_payloads_sample_data/'
    'payload.com.apple.hai.Evergreen.Weather.json'
)

# Practice RDD Functions

In [1]:
"""
currentConditions ✅
eventTime ✅
eventTimezone ✅
hourlyForecast ✅
payloadVersion ✅
sampleID ✅
"""
def txt_split(s):
    """Parse a line of whitespace-separated integers and double each one.

    Args:
        s: One line of text, e.g. ``'1 2 3'``.

    Returns:
        A list holding every parsed integer multiplied by 2. An empty or
        whitespace-only line yields an empty list.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    # split() without a separator collapses runs of whitespace and ignores
    # leading/trailing blanks, so blank lines and double spaces no longer
    # produce '' tokens that would make int() raise.
    return [int(token) * 2 for token in s.split()]

def letter_count(s):
    """Return the length of each piece of ``s`` when split on single spaces.

    Args:
        s: One line of text.

    Returns:
        A list of integers, one per piece produced by ``s.split(' ')``.
        Consecutive spaces (or an empty line) produce empty pieces, which
        are reported with length 0.
    """
    return [len(piece) for piece in s.split(' ')]

def practice1():
    """Demonstrate ``textFile`` / ``map`` / ``collect`` on the sample files.

    Prints the raw lines of ``nums_file``, then the same lines passed through
    ``txt_split``. Afterwards prints the lines of ``txt_file`` followed by the
    per-line word lengths computed by ``letter_count``.
    """
    num_lines = sc.textFile(nums_file)
    print(num_lines.collect())

    # map() applies txt_split once per line, giving one list per input line.
    doubled_per_line = num_lines.map(txt_split)
    print(doubled_per_line.collect())

    word_lines = sc.textFile(txt_file)
    print('\n--> getting lengths of each word for\n', word_lines.collect(), '\n')
    lengths_per_line = word_lines.map(letter_count).collect()
    print('\nlengths =\n', lengths_per_line)

def practice_map():
    """Contrast ``map`` with ``flatMap`` on the lines of ``txt_file``.

    ``map`` keeps one result list per line, while ``flatMap`` flattens the
    per-line lists into a single sequence. Prints the nested word lengths,
    then the flattened words and the flattened word lengths.
    """
    def word_lengths(line):
        # Length of each space-delimited piece of one line.
        return [len(word) for word in line.split(' ')]

    lines = sc.textFile(txt_file)

    nested_lengths = lines.map(word_lengths)
    print('\n_> length of all the words\n', nested_lengths.collect())

    flat_words = lines.flatMap(lambda line: line.split(' '))
    flat_lengths = lines.flatMap(word_lengths)
    print('\n_> Flat Map version\n', flat_words.collect(),
          '\n', flat_lengths.collect())

def practice_filter():
    """Demonstrate ``filter``: drop lines of ``mango_txt`` starting with 'a' or 'c'.

    Prints the lines that survive the filter.
    """
    rdd_mango = sc.textFile(mango_txt)

    def filter_ac(s):
        """Return True to keep ``s``, False when it begins with 'a' or 'c'."""
        # startswith() accepts a tuple of prefixes and is simply False for an
        # empty string, whereas indexing s[0] raised IndexError on blank lines.
        return not s.startswith(('a', 'c'))

    r_p_mango = rdd_mango.filter(filter_ac)
    print('\n_> filtered result:\n', r_p_mango.collect())

def practice_distinct():
    """Demonstrate ``distinct`` on the space-separated tokens of ``nums_file``.

    Prints all tokens, then the de-duplicated tokens built step by step, then
    the same de-duplicated result built as one chained expression.
    """
    def tokens_of(line):
        # Space-delimited pieces of one line.
        return line.split(' ')

    source = sc.textFile(nums_file)
    all_tokens = source.flatMap(tokens_of)
    unique_tokens = all_tokens.distinct()
    # Identical pipeline, written as a single chain ending in an action.
    unique_chained = source.flatMap(tokens_of).distinct().collect()
    print('_> flat map:\n', all_tokens.collect(),
          n, unique_tokens.collect(),
          n, unique_chained)

def groupbykey_prac():
    """Demonstrate ``groupByKey`` on ``(word, len(word))`` pairs from ``mango_txt``.

    Prints each distinct word together with the list of lengths gathered for
    it (one entry per occurrence of the word).
    """
    words = sc.textFile(mango_txt).flatMap(lambda line: line.split(' '))
    word_length_pairs = words.map(lambda word: (word, len(word)))
    # mapValues(list) turns the grouped iterable into a printable list.
    grouped = word_length_pairs.groupByKey().mapValues(list)
    print(grouped.collect())

def reducebykey_prac():
    """Demonstrate ``reduceByKey``: count occurrences of each token in ``nums_file``.

    Each space-delimited token is paired with 1 and the counts are summed per
    token; the resulting ``(token, count)`` pairs are printed.
    """
    rdd_nums = sc.textFile(nums_file)
    r_p_map = rdd_nums.flatMap(lambda s: s.split(' ')).map(lambda s: (s, 1))
    # No intermediate collect() here: its result was discarded, so it only
    # triggered an extra Spark job and shipped every pair to the driver.
    r_p_reduce = r_p_map.reduceByKey(lambda a, b: a + b)
    print(r_p_reduce.collect())

def word_count_quiz():
    """Word-count exercise over ``mango_txt``.

    Splits every line on single spaces, pairs each word with 1, sums the
    ones per word with ``reduceByKey`` and prints the ``(word, count)`` pairs.
    """
    words = sc.textFile(mango_txt).flatMap(lambda line: line.split(' '))
    ones = words.map(lambda word: (word, 1))
    counts = ones.reduceByKey(lambda left, right: left + right)
    print( counts.collect() )


# end of cell

# Main Practice Cell

In [39]:
# Load the mango sample file as an RDD of lines. The bare collect() expression
# is left as the last statement so the notebook displays the list of lines.
rdd_mango = sc.textFile(mango_txt)
rdd_mango.collect()

['this mango company animal',
 'cat dog ant mic laptop mango',
 'chair switch mobile am charger cover',
 'amanda mango mango any alarm ant']

In [None]:
# Closing cell: only prints a completion banner; it does not stop the SparkContext.
print('\n\n\t\t _> Spark Job complete 🤗 ✅ \n\n')