In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already running under the same builder settings.
spark = (
    SparkSession.builder
    .appName('spark-workshop exercises')
    .getOrCreate()
)

spark

22/12/06 15:21:50 WARN Utils: Your hostname, karlos-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 10.0.0.89 instead (on interface wlp2s0)
22/12/06 15:21:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/06 15:21:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Spark SQL

#### Exercise 15: Finding Most Populated Cities Per Country

In [12]:
# Load the exercise input. The first CSV row supplies the column names and
# Spark is asked to infer the column types from the values.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv('./data/spark-sql-15-input.csv')

data

DataFrame[name: string, country: string, population: string]

In [13]:
# Preview the raw rows (defaults spelled out: up to 20 rows, long cells truncated).
data.show(n=20, truncate=True)

+-----------------+-------------+----------+
|             name|      country|population|
+-----------------+-------------+----------+
|           Warsaw|       Poland| 1 764 615|
|           Cracow|       Poland|   769 498|
|            Paris|       France| 2 206 488|
|Villeneuve-Loubet|       France|    15 020|
|    Pittsburgh PA|United States|   302 407|
|       Chicago IL|United States| 2 716 000|
|     Milwaukee WI|United States|   595 351|
|          Vilnius|    Lithuania|   580 020|
|        Stockholm|       Sweden|   972 647|
|         Goteborg|       Sweden|   580 020|
+-----------------+-------------+----------+



In [14]:
# `population` was read as a string because its digits are grouped with
# whitespace (e.g. "1 764 615"). Strip the whitespace, then cast to int.

population_without_spaces = F.regexp_replace('population', r'\s', '')
data = data.withColumn('population', population_without_spaces.cast('int'))
data

DataFrame[name: string, country: string, population: int]

In [15]:
# Preview the rows again now that `population` is numeric
# (defaults spelled out: up to 20 rows, long cells truncated).
data.show(n=20, truncate=True)

+-----------------+-------------+----------+
|             name|      country|population|
+-----------------+-------------+----------+
|           Warsaw|       Poland|   1764615|
|           Cracow|       Poland|    769498|
|            Paris|       France|   2206488|
|Villeneuve-Loubet|       France|     15020|
|    Pittsburgh PA|United States|    302407|
|       Chicago IL|United States|   2716000|
|     Milwaukee WI|United States|    595351|
|          Vilnius|    Lithuania|    580020|
|        Stockholm|       Sweden|    972647|
|         Goteborg|       Sweden|    580020|
+-----------------+-------------+----------+



In [22]:
# One row per country, carrying the largest city population found in it.
largest_population = F.max('population').alias('max_population')
max_population_by_country = data.groupBy('country').agg(largest_population)

max_population_by_country.show()

+-------------+--------------+
|      country|max_population|
+-------------+--------------+
|       Sweden|        972647|
|       France|       2206488|
|United States|       2716000|
|    Lithuania|        580020|
|       Poland|       1764615|
+-------------+--------------+



In [37]:
# Keep, for every country, the city whose population equals that country's
# maximum.
#
# `max_population_by_country` is derived from `data`, so referring to
# `country` through both DataFrame handles (`data['country']` vs.
# `max_population_by_country['country']`) can be ambiguous to Spark.
# Joining on the column *name* yields a single `country` column, and the
# remaining column names are unique, so nothing has to be dropped through
# a DataFrame-qualified reference.
#
# Note: if several cities in a country tie for the maximum, all of them
# are returned.
data.join(max_population_by_country, on='country') \
    .where(F.col('population') == F.col('max_population')) \
    .select('name', 'population', 'country') \
    .show(truncate=False)

+----------+----------+-------------+
|name      |population|country      |
+----------+----------+-------------+
|Warsaw    |1764615   |Poland       |
|Paris     |2206488   |France       |
|Chicago IL|2716000   |United States|
|Vilnius   |580020    |Lithuania    |
|Stockholm |972647    |Sweden       |
+----------+----------+-------------+



In [38]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()