### Compute the average number of friends by age

In [46]:
# Import packages.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
import random, os, json

# Launch a local Spark context named "FriendsByAge". If a context from an
# earlier run of this cell is still bound to `sc`, stop it before creating
# a new one.
try:
    # Bare reference: raises NameError when `sc` has not been defined yet.
    sc
    sc.stop()
except NameError:
    # No earlier context exists in this session, so there is nothing to stop.
    pass
finally:
    # Runs on every path, including the first run when no context existed.
    print('Spinning up Spark cluster ...')
    conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
    sc = SparkContext(conf = conf)

# Display cluster information.
sc

Spinning up Spark cluster ...


In [47]:
# Get current working directory; configuration.json is expected to live here.
current_working_directory = os.getcwd()

# Load configuration file. The path is built with os.path.join so that the
# correct separator is used on every platform (no hard-coded backslash).
with open(os.path.join(current_working_directory, 'configuration.json'), 'r') as configuration_file:
    dict_configurations = json.load(configuration_file)

# Get path part for friends.csv file from configuration file.
friends_csv_path_part = dict_configurations['friends.csv_path_part']

# Get current working directory's parent.
current_working_directory_parent = os.path.dirname(current_working_directory)

# Get full path for friends.csv file (the configured part is resolved
# relative to the parent of the working directory).
friends_csv_path = os.path.abspath(os.path.join(current_working_directory_parent, friends_csv_path_part))

In [48]:
# Parser applied to every line of the friends data file.
def parseLine(line):
    """Extract the (age, number of friends) pair from one comma-separated record.

    The line is split on ','. The third field is converted to int as the age
    and the fourth field as the friend count.

    Returns:
        A tuple (age, numFriends) of two ints.

    Raises:
        IndexError: if the line has too few comma-separated fields.
        ValueError: if the age or friend-count field is not an integer.
    """
    columns = line.split(',')
    return int(columns[2]), int(columns[3])

parseLine('0,Will,33,385')

(33, 385)

In [49]:
# Read the friends.csv file into an RDD of text lines.
lines = sc.textFile(friends_csv_path)
# Preview the first 10 raw lines.
lines.take(10)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21',
 '5,Weyoun,59,318',
 '6,Gowron,37,220',
 '7,Will,54,307',
 '8,Jadzia,38,380',
 '9,Hugh,27,181']

In [50]:
# Convert every raw line into an (age, numFriends) pair via parseLine.
rdd = lines.map(parseLine)
# Preview the first 10 parsed pairs.
rdd.take(10)

[(33, 385),
 (26, 2),
 (55, 221),
 (40, 465),
 (68, 21),
 (59, 318),
 (37, 220),
 (54, 307),
 (38, 380),
 (27, 181)]

In [51]:
# Pair every friend count with a 1 so that friend totals and record counts
# can later be accumulated together for each age.
(
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .take(10)
)

[(33, (385, 1)),
 (26, (2, 1)),
 (55, (221, 1)),
 (40, (465, 1)),
 (68, (21, 1)),
 (59, (318, 1)),
 (37, (220, 1)),
 (54, (307, 1)),
 (38, (380, 1)),
 (27, (181, 1))]

**reduceByKey():** Used below to sum, for each key (age), the two components of the value tuple: element `[0]` (the number of friends) and element `[1]` (the record count).

In [52]:
# Sum the (numFriends, 1) pairs per age: element [0] accumulates the friend
# total, element [1] accumulates the number of records.
(
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .take(10)
)

[(18, (2747, 8)),
 (19, (2346, 11)),
 (20, (825, 5)),
 (21, (2807, 8)),
 (22, (1445, 7)),
 (23, (2463, 10)),
 (24, (1169, 5)),
 (25, (2172, 11)),
 (26, (4115, 17)),
 (27, (1825, 8))]

In [53]:
# For each age, build a (total friends, record count) tuple by pairing every
# friend count with 1 and summing both components per key.
totalsByAge = (
    rdd
    .mapValues(lambda numFriends: (numFriends, 1))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
)
totalsByAge.take(10)

[(18, (2747, 8)),
 (19, (2346, 11)),
 (20, (825, 5)),
 (21, (2807, 8)),
 (22, (1445, 7)),
 (23, (2463, 10)),
 (24, (1169, 5)),
 (25, (2172, 11)),
 (26, (4115, 17)),
 (27, (1825, 8))]

In [54]:
# Divide each age's friend total by its record count to obtain the average.
averagesByAge = totalsByAge.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
averagesByAge.take(10)

[(18, 343.375),
 (19, 213.27272727272728),
 (20, 165.0),
 (21, 350.875),
 (22, 206.42857142857142),
 (23, 246.3),
 (24, 233.8),
 (25, 197.45454545454547),
 (26, 242.05882352941177),
 (27, 228.125)]

In [55]:
# Bring every (age, average) pair back to the driver and print one per line.
# The order in which reduceByKey emits keys depends on partitioning, so the
# pairs are sorted by age first to keep the printed report deterministic.
results = averagesByAge.sortByKey().collect()
for result in results:
    print(result)

(18, 343.375)
(19, 213.27272727272728)
(20, 165.0)
(21, 350.875)
(22, 206.42857142857142)
(23, 246.3)
(24, 233.8)
(25, 197.45454545454547)
(26, 242.05882352941177)
(27, 228.125)
(28, 209.1)
(29, 215.91666666666666)
(30, 235.8181818181818)
(31, 267.25)
(32, 207.9090909090909)
(33, 325.3333333333333)
(34, 245.5)
(35, 211.625)
(36, 246.6)
(37, 249.33333333333334)
(38, 193.53333333333333)
(39, 169.28571428571428)
(40, 250.8235294117647)
(41, 268.55555555555554)
(42, 303.5)
(43, 230.57142857142858)
(44, 282.1666666666667)
(45, 309.53846153846155)
(46, 223.69230769230768)
(47, 233.22222222222223)
(48, 281.4)
(49, 184.66666666666666)
(50, 254.6)
(51, 302.14285714285717)
(52, 340.6363636363636)
(53, 222.85714285714286)
(54, 278.0769230769231)
(55, 295.53846153846155)
(56, 306.6666666666667)
(57, 258.8333333333333)
(58, 116.54545454545455)
(59, 220.0)
(60, 202.71428571428572)
(61, 256.22222222222223)
(62, 220.76923076923077)
(63, 384.0)
(64, 281.3333333333333)
(65, 298.2)
(66, 276.4444444444444