## Import libraries and modules

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import transform, col, concat, lit

## Create a spark session object

In [3]:
spark = (SparkSession.builder
         .appName("basic-transformation")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate()
        )

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/05 13:57:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [4]:
df = (spark.read.format("json")
      .option("multiLine", "true")
      .load("../../data/nobel_prizes.json"))


In [5]:
df.show()

+----------+--------------------+--------------------+----+
|  category|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

In [6]:
df.printSchema()

root
 |-- category: string (nullable = true)
 |-- laureates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstname: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- motivation: string (nullable = true)
 |    |    |-- share: string (nullable = true)
 |    |    |-- surname: string (nullable = true)
 |-- overallMotivation: string (nullable = true)
 |-- year: string (nullable = true)



## Apply transform function to laureates column and create a new column - laureates_full_name

In [9]:
df_transformed = (df.select("category", "overallMotivation", "year", "laureates",
                            transform(col("laureates"), lambda x: concat(x.firstname, 
                                                                         lit(" "),
                                                                         x.surname))
                            .alias("laureates_full_name")))

df_transformed.show(5, truncate=True)

+----------+-----------------+----+--------------------+--------------------+
|  category|overallMotivation|year|           laureates| laureates_full_name|
+----------+-----------------+----+--------------------+--------------------+
| chemistry|             null|2022|[{Carolyn, 1015, ...|[Carolyn Bertozzi...|
| economics|             null|2022|[{Ben, 1021, "for...|[Ben Bernanke, Do...|
|literature|             null|2022|[{Annie, 1017, "f...|      [Annie Ernaux]|
|     peace|             null|2022|[{Ales, 1018, "Th...|[Ales Bialiatski ...|
|   physics|             null|2022|[{Alain, 1012, "f...|[Alain Aspect, nu...|
+----------+-----------------+----+--------------------+--------------------+
only showing top 5 rows



In [14]:
df_transformed.select('laureates_full_name').show(truncate=False)

+--------------------------------------------------------+
|laureates_full_name                                     |
+--------------------------------------------------------+
|[Carolyn Bertozzi, Morten Meldal, Barry Sharpless]      |
|[Ben Bernanke, Douglas Diamond, Philip Dybvig]          |
|[Annie Ernaux]                                          |
|[Ales Bialiatski , null, null]                          |
|[Alain Aspect, null, Anton Zeilinger]                   |
|[Svante Pääbo]                                          |
|[Benjamin List, David MacMillan]                        |
|[David Card, Joshua Angrist, Guido Imbens]              |
|[Abdulrazak Gurnah]                                     |
|[Maria Ressa, Dmitry Muratov]                           |
|[Syukuro Manabe, Klaus Hasselmann, Giorgio Parisi]      |
|[David Julius, Ardem Patapoutian]                       |
|[Emmanuelle Charpentier, Jennifer A. Doudna]            |
|[Paul Milgrom, Robert Wilson]                          

In [15]:
df_transformed.select('laureates').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Drop duplicates in the dataframe

In [16]:
# Getting the shape of the dataframe

num_rows = df.count()
num_cols = len(df.columns)
print(f"DataFrame shape: ({num_rows}, {num_cols})")

DataFrame shape: (664, 4)


In [17]:
df_deduped = df.dropDuplicates(["category", "overallMotivation", "year"])

# Getting the shape of the dataframe

num_rows = df_deduped.count()
num_cols = len(df_deduped.columns)
print(f"DataFrame shape: ({num_rows}, {num_cols})")

DataFrame shape: (664, 4)


### Dropping duplicates based on category column

In [18]:
df_deduped_2 = df.dropDuplicates(["category"])

num_rows = df_deduped_2.count()
num_cols = len(df_deduped_2.columns)
print(f"DataFrame shape: ({num_rows}, {num_cols})")

DataFrame shape: (6, 4)


In [19]:
df_deduped_2.show()

+----------+--------------------+-----------------+----+
|  category|           laureates|overallMotivation|year|
+----------+--------------------+-----------------+----+
| chemistry|[{Carolyn, 1015, ...|             null|2022|
| economics|[{Ben, 1021, "for...|             null|2022|
|literature|[{Annie, 1017, "f...|             null|2022|
|  medicine|[{Svante, 1011, "...|             null|2022|
|     peace|[{Ales, 1018, "Th...|             null|2022|
|   physics|[{Alain, 1012, "f...|             null|2022|
+----------+--------------------+-----------------+----+



## Sort Dataframe using orderBy function

In [24]:
df_sorted = df.orderBy("year")

df_sorted.show()

+----------+--------------------+-----------------+----+
|  category|           laureates|overallMotivation|year|
+----------+--------------------+-----------------+----+
| chemistry|[{Jacobus H., 160...|             null|1901|
|literature|[{Sully, 569, "in...|             null|1901|
|     peace|[{Henry, 462, "fo...|             null|1901|
|   physics|[{Wilhelm Conrad,...|             null|1901|
|  medicine|[{Emil, 293, "for...|             null|1901|
| chemistry|[{Emil, 161, "in ...|             null|1902|
|literature|[{Theodor, 571, "...|             null|1902|
|     peace|[{Élie, 464, "for...|             null|1902|
|   physics|[{Hendrik A., 2, ...|             null|1902|
|  medicine|[{Ronald, 294, "f...|             null|1902|
|literature|[{Bjørnstjerne, 5...|             null|1903|
| chemistry|[{Svante, 162, "i...|             null|1903|
|     peace|[{Randal, 466, "f...|             null|1903|
|   physics|[{Henri, 4, "in r...|             null|1903|
|  medicine|[{Niels Ryberg, 2..

### Sort by Multiple columns

In [25]:
df_sorted = df.orderBy(["year", "category"], ascending = [False, True])

df_sorted.show()

+----------+--------------------+--------------------+----+
|  category|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

## Using sort function

In [26]:
df_sorted = df.sort(['year', 'category'], ascending = [True,False])

df_sorted.show()

+----------+--------------------+-----------------+----+
|  category|           laureates|overallMotivation|year|
+----------+--------------------+-----------------+----+
|   physics|[{Wilhelm Conrad,...|             null|1901|
|     peace|[{Henry, 462, "fo...|             null|1901|
|  medicine|[{Emil, 293, "for...|             null|1901|
|literature|[{Sully, 569, "in...|             null|1901|
| chemistry|[{Jacobus H., 160...|             null|1901|
|   physics|[{Hendrik A., 2, ...|             null|1902|
|     peace|[{Élie, 464, "for...|             null|1902|
|  medicine|[{Ronald, 294, "f...|             null|1902|
|literature|[{Theodor, 571, "...|             null|1902|
| chemistry|[{Emil, 161, "in ...|             null|1902|
|   physics|[{Henri, 4, "in r...|             null|1903|
|     peace|[{Randal, 466, "f...|             null|1903|
|  medicine|[{Niels Ryberg, 2...|             null|1903|
|literature|[{Bjørnstjerne, 5...|             null|1903|
| chemistry|[{Svante, 162, "i..

## Rename columns

In [27]:
df_renamed = df.withColumnRenamed("category", "Topic")

df_renamed.show()

+----------+--------------------+--------------------+----+
|     Topic|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

## Rename multiple columns at once

In [28]:
df_renamed  = (df.selectExpr("category as Topic", 
                             "year as Year_received",
                             "overallMotivation as Motivation"))

df_renamed.show()

+----------+-------------+--------------------+
|     Topic|Year_received|          Motivation|
+----------+-------------+--------------------+
| chemistry|         2022|                null|
| economics|         2022|                null|
|literature|         2022|                null|
|     peace|         2022|                null|
|   physics|         2022|                null|
|  medicine|         2022|                null|
| chemistry|         2021|                null|
| economics|         2021|                null|
|literature|         2021|                null|
|     peace|         2021|                null|
|   physics|         2021|"for groundbreaki...|
|  medicine|         2021|                null|
| chemistry|         2020|                null|
| economics|         2020|                null|
|literature|         2020|                null|
|     peace|         2020|                null|
|   physics|         2020|                null|
|  medicine|         2020|              

In [29]:
spark.stop()