### Introduction to PySpark

In [1]:
sc

<pyspark.context.SparkContext at 0x7f35828c90b8>

In [5]:
sqlCtx

<pyspark.sql.context.SQLContext at 0x7f35951dacf8>

In [6]:
rdd = sc.parallelize(range(1000), 20)  
rdd.getNumPartitions()

20

In [7]:
rdd.take(5)

[0, 1, 2, 3, 4]

In [8]:
text_rdd = sc.textFile('../data/clinton_emails/Emails.csv')

In [9]:
text_rdd.filter(lambda x: 'Germany' in x)

PythonRDD[4] at RDD at PythonRDD.scala:48

In [10]:
text_rdd.filter(lambda x: 'Germany' in x).first()

'had at least the tacit support of the governments of the United States, Germany, France, and Russia.'

In [11]:
germany = text_rdd.filter(lambda x: 'Germany' in x)

In [12]:
merkel = text_rdd.filter(lambda x: 'Merkel' in x)

In [13]:
germerkel = germany.union(merkel)

In [14]:
germerkel

UnionRDD[8] at union at NativeMethodAccessorImpl.java:-2

In [15]:
germerkel.count()

514

In [16]:
germerkel = germerkel.persist()

In [17]:
nums = sc.parallelize([1, 2, 3, 4])

In [18]:
sum = nums.reduce(lambda x, y: x + y)

In [19]:
sum

10

In [20]:
sumCount = nums.aggregate((0, 0),  # zero value return
                          (lambda x, y: (x[0] + y, x[1] + 1)), # first value - kind of like a map step - creates new type
                          (lambda x, y: (x[0] + y[0], x[1] + y[1]))) 
                                    # combining vals from prev step - kinda like a reduce - combines new types

In [21]:
sumCount

(10, 4)

In [22]:
nums.first()

1

In [23]:
nums.aggregate?

In [24]:
pairs = germerkel.map(lambda s: (s, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)

In [35]:
counts.take(10)

[('newspapers: the New York Times, Der Spiegel in Germany, Le Monde in France and El Pais in Spain. All five',
  1),
 ('Germany responded by reviewing all their PD operations and developing an entirely new approach based on the PD',
  2),
 ('when it once accused Germany of wanting Europe to become a large Switzerland.',
  4),
 ('governments of the United States, Germany, France, and Russia.', 1),
 ("is more about coordinating regulatory matters than trade, by the way. Raising Merkel's project",
  2),
 ("right European People's Party, which groups Angela Merkel's ruling Christian Democratic Union Party in Germany,",
  4),
 ('even mention our own targets?"" demanded a furious Angela Merkel. Australia\'s',
  1),
 ('""coalition contract"" — as happens in Germany — that would bind the Lib Dems and other smaller parties into a deal if the',
  2),
 ('2006, he said, ""We are the only country, the only people, which remain still divided. Germany reunited its',
  2),
 ('Pakistan, Britain and Ger

In [39]:
pairs = germerkel.flatMap(lambda s: s.split(' ')).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)

In [40]:
pairs.take(4)

[('remaining', 1), ('step', 5), ('was', 39), ('provide', 2)]

In [41]:
pairs.sortBy(lambda k: k[1]).top(10)

[('•Merkel', 2),
 ('•', 6),
 ('—A—Germany', 1),
 ('—', 14),
 ('–', 2),
 ('zu', 1),
 ('zone,', 5),
 ('youth:', 1),
 ('youth', 2),
 ('your', 4)]

In [29]:
import re

In [30]:
pairs = germerkel.flatMap(lambda s: re.findall('\w+', s)).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)

In [31]:
my_df = pairs.sortBy(lambda p: p[1]).collect()

In [32]:
my_df

[('remaining', 1),
 ('eastern', 1),
 ('levels', 1),
 ('Pakistan', 1),
 ('manager', 1),
 ('Rijdiger', 1),
 ('TO', 1),
 ('TBD', 1),
 ('Americans', 1),
 ('set', 1),
 ('50', 1),
 ('Walter', 1),
 ('guarantees', 1),
 ('organizational', 1),
 ('Question', 1),
 ('zu', 1),
 ('ROSCHMANN', 1),
 ('4', 1),
 ('both', 1),
 ('WE', 1),
 ('Norway', 1),
 ('Guttenberg', 1),
 ('192', 1),
 ('Hills', 1),
 ('C05764319', 1),
 ('Cheryl', 1),
 ('officer', 1),
 ('Below', 1),
 ('05T04', 1),
 ('26', 1),
 ('House', 1),
 ('128', 1),
 ('K', 1),
 ('Views', 1),
 ('beloved', 1),
 ('timetable', 1),
 ('Eikenberry', 1),
 ('Der', 1),
 ('McHaleJA', 1),
 ('Leaders', 1),
 ('Second', 1),
 ('reshaping', 1),
 ('Ukraine', 1),
 ('excerpts', 1),
 ('1950s', 1),
 ('until', 1),
 ('presidential', 1),
 ('interested', 1),
 ('engine', 1),
 ('0090', 1),
 ('RHJ', 1),
 ('couldn', 1),
 ('designated', 1),
 ('StateSeal', 1),
 ('balance', 1),
 ('formal', 1),
 ('WEOG', 1),
 ('goal', 1),
 ('Hans', 1),
 ('Patey', 1),
 ('authorities', 1),
 ('Philanthro

In [33]:
type(my_df)

list

In [34]:
my_df[-40:]

[('States', 24),
 ('Chancellor', 25),
 ('United', 26),
 ('have', 27),
 ('F', 27),
 ('German', 27),
 ('will', 28),
 ('U', 29),
 ('about', 31),
 ('not', 35),
 ('we', 36),
 ('by', 37),
 ('was', 39),
 ('from', 40),
 ('it', 40),
 ('be', 41),
 ('with', 41),
 ('on', 43),
 ('France', 45),
 ('are', 47),
 ('but', 49),
 ('its', 51),
 ('Angela', 53),
 ('has', 54),
 ('as', 55),
 ('The', 55),
 ('00', 56),
 ('Europe', 65),
 ('that', 80),
 ('for', 81),
 ('is', 114),
 ('a', 125),
 ('s', 158),
 ('Merkel', 162),
 ('in', 172),
 ('and', 197),
 ('to', 208),
 ('of', 250),
 ('the', 399),
 ('Germany', 406)]