# Spark Session

All distributed operations with Spark are done through a so-called Spark Session. Usually, one has already been created by your cluster's administrator.

In [1]:
# we need to set this environment variable to make Spark happy;
# it is set here, before the Spark session is created in the next cell
# NOTE(review): presumably this silences the PyArrow timezone warning of
# the pandas API on Spark - confirm against the docs of your Spark version
import os

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [2]:
from pyspark.sql.session import SparkSession

# build the Spark session (or get the existing one) used by the cells below
spark = (
    SparkSession.builder
    # by default it's only 1GB
    .config("spark.driver.memory", "2G")
    .getOrCreate()
)
# the last expression of the cell displays the session
spark

22/06/03 19:20:03 WARN Utils: Your hostname, boris-work-laptop resolves to a loopback address: 127.0.1.1; using 192.168.1.2 instead (on interface wlp0s20f3)
22/06/03 19:20:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/03 19:20:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Wasabi dataset

For this course, we will use the [Wasabi dataset](https://doi.org/10.5281/zenodo.5603369), the same one you use in the data visualisation course.

In [3]:
# we download the data this way
# !wget https://zenodo.org/record/5603369/files/wasabi-2-0.tar

In [4]:
# location of the extracted dataset: $WORK/data/wasabi/json/song.json
# (raises ``KeyError`` if the ``WORK`` environment variable is not set)
song_path = os.path.join(
    os.environ["WORK"], "data", "wasabi", "json", "song.json"
)

In [5]:
# notice that song.json is one large JSON object (a list of dictionaries)
# this is not particularly distributed-computations-friendly
# also notice that different songs are separated by a line containing only
# three symbols: },{
!head -500 {song_path}

[{
  "_id": {
    "$oid": "5714dec325ac0d8aee3804e7"
  },
  "position": 0,
  "lengthAlbum": "57:52",
  "urlSong": "http://lyrics.wikia.com/A:Turn_It_Up",
  "lyrics": "Turn it up<br>I don&apos;t know where you&apos;re plugging in<br>Listen up<br>Yeah, we&apos;re all set to begin<br>Turn it up<br>Because we hope you like it loud<br>Tune it up<br>Any second now<br>Making like<br>Cheeky Monkey grins<br>And for tonight<br>I&apos;d love to let you in",
  "urlWikipedia": "",
  "id_album": {
    "$oid": "5714debb25ac0d8aee34d59a"
  },
  "isClassic": false,
  "urlAllmusic": "http://www.allmusic.com/song/mt0013320473",
  "urlMusicBrainz": "http://musicbrainz.org/recording/3db608e4-eb72-437e-bc52-872f0a51434f",
  "title": "Turn It Up",
  "publicationDateAlbum": "1997",
  "albumTitle": "How Ace Are Buildings",
  "deezer_mapping": [
    [
      67354194,
      "search-exact"
    ]
  ],
  "id_song_deezer": "67354194",
  "isrc": "GBAAP9700050",
  "length": "93",
  "explicitLyrics": false,
  "rank": 2

In [6]:
# NOTE: ``pd`` is the pandas API on Spark (``pyspark.pandas``), not plain
# pandas; plain pandas is imported under its full name ``pandas`` later on
from pyspark import pandas as pd

# by default, the pandas API on Spark creates a local index;
# ask for a distributed one instead
pd.set_option("compute.default_index_type", "distributed")

In [7]:
# Spark can read simple text files and split them into lines
# using any character sequence you wish; here we split on the
# three-symbol line ``},{`` that separates two songs, so every row
# holds the fields of one song without its outer braces
# (the first row still starts with ``[{``)

song_lines = spark.read.text(
    song_path,
    lineSep="\n},{\n"
).to_pandas_on_spark()

In [8]:
# now we have one JSON per line

song_lines.head()

Unnamed: 0,value
0,"[{\n ""_id"": {\n ""$oid"": ""5714dec325ac0d8ae..."
1,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
2,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
3,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
4,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."


In [9]:
# and around 2.1M songs in total

song_lines.count()

                                                                                

value    2079510
dtype: int64

In [10]:
# notice that ``song_lines`` is not a real ``pandas.DataFrame``:
# it is a pandas-on-Spark ``pyspark.pandas.DataFrame``

type(song_lines)

pyspark.pandas.frame.DataFrame

In [11]:
# it's only a wrapper around a Spark DataFrame

print(song_lines.spark.frame())
type(song_lines.spark.frame())

DataFrame[value: string]


pyspark.sql.dataframe.DataFrame

In [12]:
# which in turn is a wrapper around something called RDD
# (Resilient Distributed Dataset)

print(song_lines.spark.frame().rdd)
type(song_lines.spark.frame().rdd)

MapPartitionsRDD[22] at javaToPython at NativeMethodAccessorImpl.java:0


pyspark.rdd.RDD

In [13]:
# RDD is distributed
# here it was spread over several partitions of roughly the same size

song_lines.spark.frame().rdd.getNumPartitions()

97

In [14]:
# an RDD is resilient: it does not hold the data itself,
# only the sequence of instructions needed to compute it, the so-called lineage

song_lines.spark.explain()

== Physical Plan ==
*(1) Project [monotonically_increasing_id() AS __index_level_0__#2L, value#0]
+- FileScan text [value#0] Batched: false, DataFilters: [], Format: Text, Location: InMemoryFileIndex(1 paths)[file:/home/boris/data/wasabi/json/song.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>




In [15]:
# the ``to_pandas`` method collects the data from all workers to the driver
# usually that will kill the driver with an OOM (out-of-memory) error
# but if we take only the head, that's OK

pandas_df = song_lines.head().to_pandas()
print(type(pandas_df))
pandas_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,value
0,"[{\n ""_id"": {\n ""$oid"": ""5714dec325ac0d8ae..."
1,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
2,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
3,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."
4,"""_id"": {\n ""$oid"": ""5714dec325ac0d8aee380..."


In [16]:
def to_a_valid_json(
    pandas_df: pd.DataFrame["value": str]
) -> pd.DataFrame["value": str]:
    """
    Turn every fragment of ``song.json`` into a standalone JSON object string.

    Each input string is expected to be the body of one song object without
    its outer braces. The first fragment of the file may still start with
    the opening ``[{`` of the enclosing list, and the last one may still end
    with its closing ``}]``; both are removed before the braces are added.

    :param pandas_df: a ``pandas.DataFrame`` with only one string column,
        ``value``; it is left unmodified
    :return: a new dataframe with the same index where every ``value`` is a
        single-line JSON object string
    """
    values = pandas_df["value"]
    # the opening "[{" of the list can only sit at the very start of a
    # fragment, so anchor the match there instead of replacing it anywhere
    values = values.str.replace(r"\A\[\{\n", " ", regex=True)
    # likewise, the closing "}]" is only removed when nothing but whitespace
    # follows it; a "}]" inside e.g. the lyrics is song data and must survive
    values = values.str.replace(r"\}\](?=\s*\Z)", " ", regex=True)
    # one JSON document per line: no raw newlines may remain
    values = values.str.replace("\n", " ", regex=False)
    # ``assign`` builds a new frame, so the caller's dataframe is not mutated
    return pandas_df.assign(value="{" + values + "}")

In [17]:
# let's test that our function works well on a real pandas DataFrame
# the head contains the first fragment (with the leading "[{") and the tail
# presumably the last one (with the trailing "}]")
import json
import pandas

# ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0;
# ``pandas.concat`` is the supported way to stack frames
pandas_df = pandas.concat(
    [song_lines.head().to_pandas(), song_lines.tail().to_pandas()]
)
transformed_df = to_a_valid_json(pandas_df)
assert isinstance(transformed_df, pandas.DataFrame)
assert len(transformed_df) == len(pandas_df)
# ``json.loads`` raises on the first string that is not valid JSON
for line in transformed_df["value"]:
    json.loads(line)

In [18]:
# now we can apply our function in batch mode to the Spark DataFrame
# NOTE(review): per the pandas-on-Spark docs, ``transform_batch`` calls the
# function on pandas DataFrame chunks and takes the output schema from its
# return type hint - confirm for your Spark version
json_lines = song_lines.pandas_on_spark.transform_batch(to_a_valid_json)
json_lines.head()

                                                                                

Unnamed: 0,value
0,"{ ""_id"": { ""$oid"": ""5714dec325ac0d8aee38..."
1,"{ ""_id"": { ""$oid"": ""5714dec325ac0d8aee380..."
2,"{ ""_id"": { ""$oid"": ""5714dec325ac0d8aee380..."
3,"{ ""_id"": { ""$oid"": ""5714dec325ac0d8aee380..."
4,"{ ""_id"": { ""$oid"": ""5714dec325ac0d8aee380..."


In [19]:
# now we write the Spark DataFrame to disk as plain text files,
# one JSON string per line
# (``mode("overwrite")`` replaces any existing ``song.json_lines`` output)
json_lines.spark.frame().write.mode("overwrite").text("song.json_lines")

                                                                                

In [20]:
# notice that the output is a directory rather than file

!ls song.json_lines

part-00000-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00001-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00002-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00003-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00004-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00005-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00006-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00007-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00008-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00009-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00010-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00011-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00012-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00013-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00014-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00015-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00016-e357f437-08f0-4131-9b93-a6a88afa710d-c000.txt
part-00017-e357f437-08f0-4131-9

In [21]:
# let's check that the first line of the first file looks like a valid JSON
!head -1 song.json_lines/part-00000-*

{   "_id": {     "$oid": "5714dec325ac0d8aee3804e7"   },   "position": 0,   "lengthAlbum": "57:52",   "urlSong": "http://lyrics.wikia.com/A:Turn_It_Up",   "lyrics": "Turn it up<br>I don&apos;t know where you&apos;re plugging in<br>Listen up<br>Yeah, we&apos;re all set to begin<br>Turn it up<br>Because we hope you like it loud<br>Tune it up<br>Any second now<br>Making like<br>Cheeky Monkey grins<br>And for tonight<br>I&apos;d love to let you in",   "urlWikipedia": "",   "id_album": {     "$oid": "5714debb25ac0d8aee34d59a"   },   "isClassic": false,   "urlAllmusic": "http://www.allmusic.com/song/mt0013320473",   "urlMusicBrainz": "http://musicbrainz.org/recording/3db608e4-eb72-437e-bc52-872f0a51434f",   "title": "Turn It Up",   "publicationDateAlbum": "1997",   "albumTitle": "How Ace Are Buildings",   "deezer_mapping": [     [       67354194,       "search-exact"     ]   ],   "id_song_deezer": "67354194",   "isrc": "GBAAP9700050",   "length": "93",   "explicitLyrics": false,   "rank": 26

In [22]:
# on a local machine, it makes sense to manually stop Spark
spark.stop()