# Jonathan Halverson
# Friday, February 16, 2018
# Spark 2 basics

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Entry point for the DataFrame API. getOrCreate() hands back the session that
# is already running, if any, instead of starting a second one.
spark = SparkSession.builder.appName("NoApp").getOrCreate()

In [3]:
# Load the file as a DataFrame with one row per line of text; the single
# string column is named "value".
lines = spark.read.text('text_file.md')
# Parenthesized single-argument print is valid on both Python 2 and Python 3.
print(lines.count())
print(lines.first())

95
Row(value=u'# Apache Spark')


In [4]:
# A bare expression at the end of a cell displays its repr: here only the
# DataFrame's column names and types, not its rows.
lines

DataFrame[value: string]

In [5]:
# Print each column's name, type and nullability as a tree.
lines.printSchema()

root
 |-- value: string (nullable = true)



In [6]:
# take(5) returns the first five rows to the driver as a list of Row objects.
# Parenthesized print is valid on both Python 2 and Python 3.
print(lines.take(5))

[Row(value=u'# Apache Spark'), Row(value=u''), Row(value=u'Spark is a fast and general cluster computing system for Big Data. It provides'), Row(value=u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that'), Row(value=u'supports general computation graphs for data analysis. It also supports a')]


In [7]:
# collect() returns every row to the driver as a list; fine for this small
# file, but avoid it on large data sets.
# Parenthesized print is valid on both Python 2 and Python 3.
print(lines.collect())

[Row(value=u'# Apache Spark'), Row(value=u''), Row(value=u'Spark is a fast and general cluster computing system for Big Data. It provides'), Row(value=u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that'), Row(value=u'supports general computation graphs for data analysis. It also supports a'), Row(value=u'rich set of higher-level tools including Spark SQL for SQL and DataFrames,'), Row(value=u'MLlib for machine learning, GraphX for graph processing,'), Row(value=u'and Spark Streaming for stream processing.'), Row(value=u''), Row(value=u'<http://spark.apache.org/>'), Row(value=u''), Row(value=u''), Row(value=u'## Online Documentation'), Row(value=u''), Row(value=u'You can find the latest Spark documentation, including a programming'), Row(value=u'guide, on the [project web page](http://spark.apache.org/documentation.html)'), Row(value=u'and [project wiki](https://cwiki.apache.org/confluence/display/SPARK).'), Row(value=u'This README file only contains basic set

In [8]:
# Draw a random sample of the rows without replacement. No seed is passed, so
# the rows returned vary from run to run.
lines.sample(withReplacement=False, fraction=0.1).collect()

[Row(value=u''),
 Row(value=u'## Interactive Python Shell'),
 Row(value=u'    ./bin/pyspark'),
 Row(value=u''),
 Row(value=u'Spark also comes with several sample programs in the `examples` directory.'),
 Row(value=u''),
 Row(value=u'package. For instance:'),
 Row(value=u'["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version)'),
 Row(value=u'')]

In [9]:
# Drop down to the underlying RDD of Row objects and keep only the lines that
# mention 'Python' or 'Spark' (case-sensitive substring match).
plines = lines.rdd.filter(lambda row: 'Python' in row.value or 'Spark' in row.value)
# Parenthesized print is valid on both Python 2 and Python 3.
print(plines.count())

20


In [10]:
from pyspark.sql import functions as F

In [11]:
# Add a "Length" column holding the number of characters in each line.
chars = lines.withColumn('Length', F.length(lines.value))
# show() prints a formatted table and truncates long strings for display.
chars.show(5)

+--------------------+------+
|               value|Length|
+--------------------+------+
|      # Apache Spark|    14|
|                    |     0|
|Spark is a fast a...|    78|
|high-level APIs i...|    75|
|supports general ...|    73|
+--------------------+------+
only showing top 5 rows



In [12]:
# Same per-line lengths, computed with the RDD API instead of a DataFrame
# column. Note: this rebinds `chars` from a DataFrame to an RDD of ints.
chars = lines.rdd.map(lambda row: len(row.value))
# Parenthesized print is valid on both Python 2 and Python 3.
print(chars.take(10))

[14, 0, 78, 75, 73, 74, 56, 42, 0, 26]


In [13]:
# Build a small RDD from a local Python list.
small = spark.sparkContext.parallelize(['dog', 'fish', 'cat', 'mouse'])
# union() concatenates the two RDDs without checking element types, so the
# result mixes plain strings with Row objects.
small_and_keys = small.union(plines)
# Parenthesized print is valid on both Python 2 and Python 3.
print(small_and_keys.collect())

['dog', 'fish', 'cat', 'mouse', Row(value=u'# Apache Spark'), Row(value=u'Spark is a fast and general cluster computing system for Big Data. It provides'), Row(value=u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that'), Row(value=u'rich set of higher-level tools including Spark SQL for SQL and DataFrames,'), Row(value=u'and Spark Streaming for stream processing.'), Row(value=u'You can find the latest Spark documentation, including a programming'), Row(value=u'## Building Spark'), Row(value=u'Spark is built using [Apache Maven](http://maven.apache.org/).'), Row(value=u'To build Spark and its example programs, run:'), Row(value=u'["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html).'), Row(value=u'The easiest way to start using Spark is through the Scala shell:'), Row(value=u'## Interactive Python Shell'), Row(value=u'Alternatively, if you prefer Python, you can use the Python shell:'), Row(value=u'Spark also comes with several sample pro

In [14]:
# Union of an RDD of strings with an RDD of ints; the result holds both types.
small_and_ints = small.union(chars)
# Parenthesized print is valid on both Python 2 and Python 3.
print(small_and_ints.take(20))

['dog', 'fish', 'cat', 'mouse', 14, 0, 78, 75, 73, 74, 56, 42, 0, 26, 0, 0, 23, 0, 68, 76]


In [15]:
# Total number of line lengths versus the number of distinct lengths.
# str.format keeps the output as "a b" on both Python 2 and Python 3; a plain
# print(a, b) would print a tuple on Python 2.
print('{} {}'.format(chars.count(), chars.distinct().count()))

95 42


In [16]:
# Find the maximum two ways: a hand-written pairwise reduce and the built-in
# RDD.max(); both should agree.
largest_by_reduce = chars.reduce(lambda x, y: x if x > y else y)
# str.format keeps the output as "a b" on both Python 2 and Python 3.
print('{} {}'.format(largest_by_reduce, chars.max()))

120 120


In [17]:
# Bring every line length back to the driver as a Python list.
# Parenthesized print is valid on both Python 2 and Python 3.
print(chars.collect())

[14, 0, 78, 75, 73, 74, 56, 42, 0, 26, 0, 0, 23, 0, 68, 76, 70, 56, 0, 17, 0, 62, 45, 0, 39, 0, 67, 66, 76, 0, 26, 0, 64, 0, 21, 0, 52, 0, 44, 0, 27, 0, 66, 0, 17, 0, 61, 0, 43, 0, 19, 0, 74, 74, 0, 29, 0, 32, 0, 75, 62, 41, 73, 72, 22, 0, 54, 0, 69, 0, 16, 0, 84, 17, 0, 19, 0, 33, 120, 0, 31, 0, 77, 76, 77, 0, 42, 120, 84, 65, 0, 16, 0, 97, 70]


In [18]:
# Split every line on whitespace, flatten the words into a single RDD, and
# pair each word with the count 1 (the usual first step of a word count).
pairs = lines.rdd.flatMap(lambda row: row.value.split()).map(lambda x: (x, 1))
# Parenthesized print is valid on both Python 2 and Python 3.
print(pairs.take(5))

[(u'#', 1), (u'Apache', 1), (u'Spark', 1), (u'Spark', 1), (u'is', 1)]


In [19]:
# Note that map() may change the element type: here int -> str.
# Lengths above 10 become 'dog', everything else becomes 'cat'.
trans = chars.map(lambda x: 'dog' if x > 10 else 'cat')
# Parenthesized print is valid on both Python 2 and Python 3.
print(trans.take(5))

['dog', 'cat', 'dog', 'dog', 'dog']


In [20]:
# countByValue() returns a dict-like mapping of each distinct value to the
# number of times it occurs, collected on the driver.
# Parenthesized print is valid on both Python 2 and Python 3.
print(chars.countByValue())

defaultdict(<type 'int'>, {0: 35, 14: 1, 16: 2, 17: 3, 19: 2, 21: 1, 22: 1, 23: 1, 26: 2, 27: 1, 29: 1, 31: 1, 32: 1, 33: 1, 39: 1, 41: 1, 42: 2, 43: 1, 44: 1, 45: 1, 52: 1, 54: 1, 56: 2, 61: 1, 62: 2, 64: 1, 65: 1, 66: 2, 67: 1, 68: 1, 69: 1, 70: 2, 72: 1, 73: 2, 74: 3, 75: 2, 76: 3, 77: 2, 78: 1, 84: 2, 97: 1, 120: 2})


In [21]:
# top(5) returns the five largest elements in descending order.
# Parenthesized print is valid on both Python 2 and Python 3.
print(chars.top(5))

[120, 120, 97, 84, 84]


In [22]:
from pyspark import StorageLevel

In [23]:
# Note that persist() does not force evaluation; the RDD is only materialized
# and cached when the next action runs.
# StorageLevel.MEMORY_AND_DISK is the named constant for
# StorageLevel(True, True, False, False, 1): keep partitions in memory, spill
# to disk when they do not fit, one replica.
chars.persist(StorageLevel.MEMORY_AND_DISK)

PythonRDD[31] at RDD at PythonRDD.scala:48