In [46]:
# Import packages.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
import random, os, json

# Launch spark cluster. Restart cluster, if it is already started.
try:
    # `sc` is only bound when an earlier run of this cell created a context;
    # in that case stop it so a new one can be created below.
    sc.stop()
except NameError:
    # First run in this kernel: there is no context to stop.
    pass

# Start-up deliberately lives outside the try statement (not in `finally`),
# so that an unexpected failure of sc.stop() surfaces on its own instead of
# being followed by an attempt to start a second context.
print('Spinning up Spark cluster ...')
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

# Display cluster information.
sc

Spinning up Spark cluster ...


In [47]:
# Get current working directory.
current_working_directory = os.getcwd()

# Load configuration file.
# The path is built with os.path.join rather than a literal '\' so that it
# resolves on any operating system and does not rely on the invalid '\c'
# escape sequence.
configuration_path = os.path.join(current_working_directory, 'configuration.json')
with open(configuration_path, 'r') as configuration_file:
    dict_configurations = json.load(configuration_file)

# Get path part for friends.csv file from configuration file.
friends_csv_path_part = dict_configurations['friends.csv_path_part']

# Get current working directory's parent.
current_working_directory_parent = os.path.dirname(current_working_directory)

# Get full path for friends.csv file.
friends_csv_path = os.path.abspath(os.path.join(current_working_directory_parent, friends_csv_path_part))

In [None]:
# Parser applied to every line of the data file.
def parseLine(line):
    """Extract the (age, number of friends) pair from one comma-separated record.

    Parameters
    ----------
    line : str
        One record of the data file, e.g. '0,Will,33,385'. The third and
        fourth comma-separated fields are read as integers.

    Returns
    -------
    tuple of (int, int)
        (age, numFriends) taken from fields 2 and 3 (zero-based).

    Raises
    ------
    IndexError
        If the line has fewer than four comma-separated fields.
    ValueError
        If either of the two fields is not a valid integer.
    """
    columns = line.split(',')
    return (int(columns[2]), int(columns[3]))

parseLine('0,Will,33,385')

(33, 385)

In [None]:
# Read the friends file into an RDD holding one string per line of text.
lines = sc.textFile(name=friends_csv_path)

# Preview the first ten raw records.
lines.take(num=10)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21',
 '5,Weyoun,59,318',
 '6,Gowron,37,220',
 '7,Will,54,307',
 '8,Jadzia,38,380',
 '9,Hugh,27,181']

In [None]:
# Convert every raw line into the pair returned by parseLine.
rdd = lines.map(f=parseLine)

# Preview the first ten parsed pairs.
rdd.take(num=10)

In [None]:
# Preview step 1: keep the key and turn each value into (value, 1), so that
# the second component can later be summed into a record count.
(
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .take(10)
)

**reduceByKey():** Used below to sum, for each key, the two components of the value tuple, which are accessed by index as [0] and [1].

In [None]:
# Preview step 2: per key, add up the first components ([0]) and the second
# components ([1]) of the (value, 1) tuples.
(
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .take(10)
)

In [None]:
# Per key, build a (sum of values, number of records) tuple:
#   1. mapValues turns each value into (value, 1);
#   2. reduceByKey adds the tuples of equal keys component by component.
totalsByAge = (
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)

# Preview the first ten per-key totals.
totalsByAge.take(10)

In [None]:
# Divide each key's summed value ([0]) by its record count ([1]) to obtain
# the per-key average.
averagesByAge = totalsByAge.mapValues(lambda totals: totals[0] / totals[1])

# Preview the first ten per-key averages.
averagesByAge.take(10)

In [None]:
# Bring all (key, average) pairs back to the driver as a Python list.
results = averagesByAge.collect()

# Print one pair per line.
for age_and_average in results:
    print(age_and_average)