# Spark Tutorial - Wiederholung vom 12.12.22

In [1]:
# Wir wollen DataFrames nutzen und dafür brauchen wir eine SparkSession
# Import SparkSession
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder\
    .master("local[1]")\
    .appName("Datenbanken mit Spark")\
    .getOrCreate() # getOrCreate liefert existierende Session, wenn es schon eine gibt

In [2]:
spark

In [8]:
# File einlesen
frank = spark.read.text("../frankenstein.txt")

In [11]:
# Dataframe mit einer Spalte
frank

DataFrame[value: string]

In [14]:
# Explizites Anzeigen des Schemas
frank.printSchema()

root
 |-- value: string (nullable = true)



In [9]:
frank.show()

+--------------------+
|               value|
+--------------------+
|                    |
|Project Gutenberg...|
|                    |
|This eBook is for...|
|almost no restric...|
|re-use it under t...|
|with this eBook o...|
|                    |
|                    |
| Title: Frankenstein|
|       or The Mod...|
|                    |
|Author: Mary Woll...|
|                    |
|Release Date: Jun...|
|Last updated: Jan...|
|                    |
|   Language: English|
|                    |
|Character set enc...|
+--------------------+
only showing top 20 rows



In [10]:
frank.count()

7834

In [16]:
# Array mit Tupeln
print(frank.dtypes)

[('value', 'string')]


In [21]:
# Case-Sensitive!
print(f"Ist das Element == 'string': {frank.dtypes[0][1] == 'string'}")

Ist das Element == 'string': True


In [39]:
# truncate = Abschneiden nach ~20 Zeichen
# Zeige die ersten 30 Zeilen (komplett) und deaktiviere das Abschneiden
frank.show(100,truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|                                                                         |
|Project Gutenberg's Frankenstein, by Mary Wollstonecraft (Godwin) Shelley|
|                                                                         |
|This eBook is for the use of anyone anywhere at no cost and with         |
|almost no restrictions whatsoever.  You may copy it, give it away or     |
|re-use it under the terms of the Project Gutenberg License included      |
|with this eBook or online at www.gutenberg.net                           |
|                                                                         |
|                                                                         |
|Title: Frankenstein                                                      |
|       or T

In [38]:
# Die Zeilen in einzelne Worte splitten
from pyspark.sql.functions import split
# select selektiert eine oder mehrere Spalten, hier eben nur eine Spalte
# alias gibt dem Ergebnis (der selektierten Spalte) einen Namen
# Die Funktion split nimmt REGEXP... split nach Leerzeichen
# Der Datentyp eines split ist ein Array!
lines = frank.select(split(frank.value, " ").alias("Zeile"))
lines.show(10, truncate=100)

+----------------------------------------------------------------------------------+
|                                                                             Zeile|
+----------------------------------------------------------------------------------+
|                                                                                []|
|[Project, Gutenberg's, Frankenstein,, by, Mary, Wollstonecraft, (Godwin), Shelley]|
|                                                                                []|
|   [This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]|
|[almost, no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or]|
|   [re-use, it, under, the, terms, of, the, Project, Gutenberg, License, included]|
|                            [with, this, eBook, or, online, at, www.gutenberg.net]|
|                                                                                []|
|                                                                

In [33]:
lines.printSchema()

root
 |-- Zeile: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [37]:
# Eine Alternative zu alias() ist withColumnRenamed
#linesWRC = lines.withColumnRenamed("Zeile", "ZeileRenamed")
#linesWRC.show(10, truncate=100)

+----------------------------------------------------------------------------------+
|                                                                      ZeileRenamed|
+----------------------------------------------------------------------------------+
|                                                                                []|
|[Project, Gutenberg's, Frankenstein,, by, Mary, Wollstonecraft, (Godwin), Shelley]|
|                                                                                []|
|   [This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]|
|[almost, no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or]|
|   [re-use, it, under, the, terms, of, the, Project, Gutenberg, License, included]|
|                            [with, this, eBook, or, online, at, www.gutenberg.net]|
|                                                                                []|
|                                                                

In [46]:
# Schritt 1: RegEx anpassen, dass nur Wörter beginnend mit Buchstaben ausgegeben werden.
# Zahlen werden ausgeschlossen
# Die Buchstaben können Groß- oder Kleingeschrieben sein
lines = frank.select(split(frank.value, "[^a-z,A-Z]").alias("Zeile"))
lines.show(100,truncate=False)

+-------------------------------------------------------------------------------------+
|Zeile                                                                                |
+-------------------------------------------------------------------------------------+
|[]                                                                                   |
|[Project, Gutenberg, s, Frankenstein,, by, Mary, Wollstonecraft, , Godwin, , Shelley]|
|[]                                                                                   |
|[This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]      |
|[almost, no, restrictions, whatsoever, , , You, may, copy, it,, give, it, away, or]  |
|[re, use, it, under, the, terms, of, the, Project, Gutenberg, License, included]     |
|[with, this, eBook, or, online, at, www, gutenberg, net]                             |
|[]                                                                                   |
|[]                             

In [49]:
# Spalte selektieren auf diverse Arten und Ausgabe als Dataframe
lines.select(lines.Zeile).show()
lines.select("Zeile").show()

from pyspark.sql.functions import col
lines.select(col("Zeile")).show()

#Ausgabe von Column, statt Dataframe
#lines["Zeile"].show()
#Spalte ntuzen, um dann eine Spalte im Dataframe auszuwählen
lines.select(lines["Zeile"]).show()

# ...

+--------------------+
|               Zeile|
+--------------------+
|                  []|
|[Project, Gutenbe...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re, use, it, und...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title, , Franken...|
|[, , , , , , , or...|
|                  []|
|[Author, , Mary, ...|
|                  []|
|[Release, Date, ,...|
|[Last, updated, ,...|
|                  []|
|[Language, , Engl...|
|                  []|
|[Character, set, ...|
+--------------------+
only showing top 20 rows

+--------------------+
|               Zeile|
+--------------------+
|                  []|
|[Project, Gutenbe...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re, use, it, und...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title, , Franken...|
|[, , , , , , , or...|
|                  []|
|[Author, , Mary, ...|
|                  []|
|[Release, Date, ,...|
|[Last, 

## Exploding list of words into ROWS (nicht COLS)

In [73]:
# Spark explode = Jedes Element wird zu einer neuen Reihe gemacht
# Wieso explode statt Python-Code? - Automatische Zuteilung auf Cluster
from pyspark.sql.functions import explode, col
words = frank.select(explode(split(frank.value, "[^a-z,A-Z]")).alias("words"))
words.show(100,truncate=False)

+--------------+
|words         |
+--------------+
|              |
|Project       |
|Gutenberg     |
|s             |
|Frankenstein, |
|by            |
|Mary          |
|Wollstonecraft|
|              |
|Godwin        |
|              |
|Shelley       |
|              |
|This          |
|eBook         |
|is            |
|for           |
|the           |
|use           |
|of            |
|anyone        |
|anywhere      |
|at            |
|no            |
|cost          |
|and           |
|with          |
|almost        |
|no            |
|restrictions  |
|whatsoever    |
|              |
|              |
|You           |
|may           |
|copy          |
|it,           |
|give          |
|it            |
|away          |
|or            |
|re            |
|use           |
|it            |
|under         |
|the           |
|terms         |
|of            |
|the           |
|Project       |
|Gutenberg     |
|License       |
|included      |
|with          |
|this          |
|eBook        

In [74]:
from pyspark.sql.functions import lower
words_lower = words.select(lower(col("words")).alias("words_lower"))
words_lower.show(100,truncate=False)

+--------------+
|words_lower   |
+--------------+
|              |
|project       |
|gutenberg     |
|s             |
|frankenstein, |
|by            |
|mary          |
|wollstonecraft|
|              |
|godwin        |
|              |
|shelley       |
|              |
|this          |
|ebook         |
|is            |
|for           |
|the           |
|use           |
|of            |
|anyone        |
|anywhere      |
|at            |
|no            |
|cost          |
|and           |
|with          |
|almost        |
|no            |
|restrictions  |
|whatsoever    |
|              |
|              |
|you           |
|may           |
|copy          |
|it,           |
|give          |
|it            |
|away          |
|or            |
|re            |
|use           |
|it            |
|under         |
|the           |
|terms         |
|of            |
|the           |
|project       |
|gutenberg     |
|license       |
|included      |
|with          |
|this          |
|ebook        

In [77]:
from pyspark.sql.functions import regexp_extract
words_clean = words_lower.select(regexp_extract("words_lower", "[a-zA-z]{2,}|a|i", 0).alias("words_nonempty"))
words_clean.show()

+--------------+
|words_nonempty|
+--------------+
|              |
|       project|
|     gutenberg|
|              |
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|              |
|        godwin|
|              |
|       shelley|
|              |
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
+--------------+
only showing top 20 rows



In [115]:
words_nonempty = words_clean.filter(col("words_nonempty") != "")
words_nonempty.show()

+--------------+
|words_nonempty|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [90]:
# WHERE == filter()
proper_words_where = words_clean.where(col("words_nonempty") != "")
proper_words_where.show(20)

+--------------+
|words_nonempty|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [88]:
# Entfernt das Wort, welches mit 'any' beginnt
proper_words_any = words_clean.filter(col("words_nonempty") != "any*")
proper_words_any.show(20)

+--------------+
|words_nonempty|
+--------------+
|              |
|       project|
|     gutenberg|
|              |
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|              |
|        godwin|
|              |
|       shelley|
|              |
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
+--------------+
only showing top 20 rows



## Die große Frage: In welcher Reihenfolge filtern? - Wie crazy muss die Regex sein?
Antwort: Lesbarkeit geht vor!</br>Spark evaluiert lazy und kann "hintendran" sehr viel optimieren.</br>Es gibt keine Reihenfolge, da man es so durchführen sollte, wie man es selbst am Besten versteht.

## Aufgabe 1
Das Wort "is" soll aus dem gesamten Text entfernt werden und Wörter der mindestlänge von 3 sollen ausgegeben werden

In [94]:
words_short = words_nonempty.where(col("words_nonempty") != "is")
words_short.show(20)

+--------------+
|words_nonempty|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
|           and|
+--------------+
only showing top 20 rows



In [92]:
from pyspark.sql.functions import length
min3Zeichen = words_nonempty.where(length(col("words_nonempty"))>3)
min3Zeichen.show(20)

+--------------+
|words_nonempty|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|        anyone|
|      anywhere|
|          cost|
|          with|
|        almost|
|  restrictions|
|    whatsoever|
|          copy|
|          give|
|          away|
|         under|
+--------------+
only showing top 20 rows



## Aufgabe 2
Finde programmatisch heraus, wie viele Spalten **keine** Strings sind.

In [97]:
datenA2 = spark.createDataFrame([["test", "noch ein test", 10_000_000_000]],["1","2","3"])
datenA2.printSchema()

root
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)
 |-- 3: long (nullable = true)



In [98]:
cnt = 0
for x, y in datenA2.dtypes:
    if y != "string":
        cnt += 1
print(f"cnt = {cnt}")

cnt = 1


## Aufgabe 3
Mache den Code lesbarer!

In [99]:
# Ausgangscode
datenA3 = spark.read.text("../frankenstein.txt").select(length(col("value"))).withColumnRenamed("length(value)", "numChar")

In [101]:
# Austauschen von withColumnRenamed mit alias()
datenA3 = spark.read.text("../frankenstein.txt").select(length(col("value"))).alias("numChar")

## Aufgabe 4
Im folgenden Code gibt es ein Problem. Was ist es und wie kann man es reparieren?

In [111]:
datenA4 = spark.createDataFrame([["key", 20_000_000, 10_000_000_000]], ["key", "value1", "value2"])
datenA4.printSchema()

root
 |-- key: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: long (nullable = true)



In [113]:
from pyspark.sql.functions import greatest

datenA4M = datenA4.select(greatest(col("value1"), col("value2")).alias("maxVal")).select("key", "maxVal")
datenA4M.show()

AnalysisException: Column 'key' does not exist. Did you mean one of the following? [maxVal];
'Project ['key, maxVal#489L]
+- Project [greatest(value1#470L, value2#471L) AS maxVal#489L]
   +- LogicalRDD [key#469, value1#470L, value2#471L], false


In [112]:
from pyspark.sql.functions import greatest

datenA4M = datenA4.select(col("key"),greatest(col("value1"), col("value2")).alias("maxVal")).select("key", "maxVal")
datenA4M.show()

# Das Problem bestand darin, dass ein select auf den "key" getätigt wurde, allerdings diese Spalte noch nicht existierte

+---+-----------+
|key|     maxVal|
+---+-----------+
|key|10000000000|
+---+-----------+



## Aufgabe 5
Filtere mit Hilfe der Funktion *isin* die Wörter *is*, *not*, *if* und *the* aus dem Text

In [123]:
filter_words = ["is", "not", "if", "the"]
words_nonempty.filter(~words_nonempty.words_nonempty.isin(filter_words)).show()
#words_nonempty.filter(words_nonempty.words_nonempty.isin(filter_words)).show() = Gibt alle filter_words aus

+--------------+
|words_nonempty|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|           for|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
|           and|
|          with|
+--------------+
only showing top 20 rows



In [127]:
df_filter_words = spark.createDataFrame([["is"], ["not"], ["if"], ["the"]],["filter word"])
df_filter_words.show()

#df2_filter_words = spark.createDataFrame([["is", "not", "if", "the"]],["filter word"])
#df2_filter_words.show()

+-----------+
|filter word|
+-----------+
|         is|
|        not|
|         if|
|        the|
+-----------+



## Aufgabe 6
Finde den / die Fehler im folgenden Code und repariere ihn so, dass der Code so wie erwartet funktioniert

In [133]:
from pyspark.sql.functions import col, split
try:
    book = spark.read.text("frankenstein.txt")
    book = book.printSchema() # Überschreiben der Variable book -> Select kann auf das Schema nicht ausgeführt werden!
    lines= book.select(spolit(book.value, " ").alias("lime")) # spolit = split
    words = lines.select(explode(col("line")).alias("word")) # Spalte "line" existiert nicht
except AnalysisException as err:
    print(err)
words.show()

Path does not exist: file:/home/jovyan/spark/frankenstein.txt
+--------------+
|          word|
+--------------+
|              |
|       Project|
|   Gutenberg's|
| Frankenstein,|
|            by|
|          Mary|
|Wollstonecraft|
|      (Godwin)|
|       Shelley|
|              |
|          This|
|         eBook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
+--------------+
only showing top 20 rows



In [132]:
from pyspark.sql.functions import col, split
try:
    text = spark.read.text("../frankenstein.txt")
    book = text.printSchema() 
    lines= text.select(split(text.value, " ").alias("lime"))
    words = lines.select(explode(col("lime")).alias("word"))
except AnalysisException as err:
    print(err)
words.show()

root
 |-- value: string (nullable = true)

+--------------+
|          word|
+--------------+
|              |
|       Project|
|   Gutenberg's|
| Frankenstein,|
|            by|
|          Mary|
|Wollstonecraft|
|      (Godwin)|
|       Shelley|
|              |
|          This|
|         eBook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
+--------------+
only showing top 20 rows

