# Spark Session

All distributed operations with Spark are done through a so-called Spark Session. Usually one has already been created for you by your cluster's administrator.

In [0]:
spark

# Wasabi dataset

For this course, we will use the [Wasabi dataset](https://doi.org/10.5281/zenodo.5603369), the same one you use in the data visualisation course.

In [0]:
import os

# The archive is available from two buckets; set the scheme your workspace can reach:
#   "gs" - Google Storage bucket (faster download)
#   "s3" - S3 bucket (the only choice on Databricks Community Edition)
# A single assignment is used so the selected value is not silently overwritten.
cloud_storage = "s3"
# here we download the archive to the driver machine (linux filesystem)
dbutils.fs.cp(
    f"{cloud_storage}://wasabi-dataset/wasabi-2-0.tar",
    f"file://{os.getcwd()}"
)

Out[3]: True

In [0]:
# we need the local copy because unarchiving tools generally won't work on distributed file systems
# notice a ``json.zip`` archive containing the data we need
!tar -xvf ./wasabi-2-0.tar

json/
json/json.zip
rdf/
rdf/song_chords.zip
rdf/artist.zip
rdf/wsb-2.0.ttl
rdf/song_tags_and_emotions.zip
rdf/album.zip
rdf/wasabi-metadata-dataset.ttl
rdf/song_core.zip
rdf/song_topics.zip


In [0]:
# we plan to work with JSON data, no RDF
!unzip ./json/json.zip

Archive:  ./json/json.zip
  inflating: emotion-tags.json       
  inflating: social-tags.json        
  inflating: song.json               
  inflating: song-topic.json         
  inflating: topic-models.json       
  inflating: album.json              
  inflating: artist-members.json     
  inflating: artist-without-members.json  


In [0]:
# use "album.json" on Databricks Community Edition (only 10GB of DBFS); the outputs shown in this notebook were produced with "album.json"
file_name = "song.json"
# this is a local (linux) path
file_path = os.path.join(os.getcwd(), file_name)
print(file_path)

/databricks/driver/album.json


In [0]:
# notice that this file is one large JSON object (a list of dictionaries)
# this is not particularly distributed-computations-friendly
# also notice that different songs are separated by a line containing only
# three symbols: },{
!head -500 {file_path}

[{
  "_id": {
    "$oid": "5714debb25ac0d8aee34d59a"
  },
  "name": "A",
  "urlWikipedia": "http://en.wikipedia.org/wiki/How_Ace_Are_Buildings",
  "genre": "Alternative Rock",
  "length": "57:52",
  "urlAlbum": "http://lyrics.wikia.com/A:How_Ace_Are_Buildings_%281997%29",
  "id_artist": {
    "$oid": "56d7e91b6b60c09814f93e4a"
  },
  "rdf": "<?xml version='1.0' encoding='utf-8' ?> <rdf:RDF  xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'  xmlns:rdfs='http://www.w3.org/2000/01/rdf-schema#'  xmlns:dct='http://purl.org/dc/terms/'  xmlns:dbo='http://dbpedia.org/ontology/' >   <rdf:Description rdf:about='http://dbpedia.org/resource/How_Ace_Are_Buildings'>     <dct:subject rdf:resource='http://dbpedia.org/resource/Category:A_(band)_albums' />     <dct:subject rdf:resource='http://dbpedia.org/resource/Category:1997_albums' />     <dbo:abstract xml:lang='en'>How Ace Are Buildings is the debut album by British alternative rock band A, released in 1997.The album was re-releas

In [0]:
from pyspark import pandas as pd

# by default, Pandas API creates a local index
pd.set_option("compute.default_index_type", "distributed")

In [0]:
# now we upload our songs JSON to DBFS (DataBricks File System)
# it's similar to GFS (Google File System), HDFS (Hadoop Distributed File System), and Amazon S3
# using the dbfs:// prefix is not obligatory, it's DBFS by default
dbutils.fs.cp(f"file://{file_path}", f"dbfs:///{file_name}")

Out[9]: True

In [0]:
# check that the songs data appeared in the DBFS
dbutils.fs.ls("dbfs:///")

Out[10]: [FileInfo(path='dbfs:/album.json', name='album.json', size=473245518, modificationTime=1663089384000),
 FileInfo(path='dbfs:/databricks/', name='databricks/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-datasets/', name='databricks-datasets/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/databricks-results/', name='databricks-results/', size=0, modificationTime=0)]

In [0]:
# Spark can read simple text files and split them into lines
# using any character sequence you wish

# every record of the JSON array becomes one row: the separator between two
# records is a line containing only ``},{``
# ``pandas_api`` is the replacement for ``to_pandas_on_spark``,
# which is deprecated since Spark 3.3
raw_lines = spark.read.text(
    f"dbfs:///{file_name}",
    lineSep="\n},{\n"
).pandas_api()

In [0]:
# now we have one JSON per line

raw_lines.head()

Unnamed: 0,value
0,"[{\n ""_id"": {\n ""$oid"": ""5714debb25ac0d8ae..."
1,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
2,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
3,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
4,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."


In [0]:
# and around 2.1M songs (or 210000 albums) in total

raw_lines.count()

Out[13]: value    208743
dtype: int64

In [0]:
# notice that ``raw_lines`` is not a real ``pandas.DataFrame``

type(raw_lines)

Out[14]: pyspark.pandas.frame.DataFrame

In [0]:
# it's only a wrapper around a Spark DataFrame

print(raw_lines.spark.frame())
type(raw_lines.spark.frame())

DataFrame[value: string]
Out[15]: pyspark.sql.dataframe.DataFrame

In [0]:
# which in turn is a wrapper around something called RDD
# (Resilient Distributed Dataset)

print(raw_lines.spark.frame().rdd)
type(raw_lines.spark.frame().rdd)

MapPartitionsRDD[28] at javaToPython at NativeMethodAccessorImpl.java:0
Out[16]: pyspark.rdd.RDD

In [0]:
# RDD is distributed
# here it was spread over several partitions of roughly the same size

raw_lines.spark.frame().rdd.getNumPartitions()

Out[17]: 8

In [0]:
# RDD is resilient: it does not hold the data itself,
# only the sequence of instructions needed to compute it, the so-called lineage

raw_lines.spark.explain()

== Physical Plan ==
*(1) Project [monotonically_increasing_id() AS __index_level_0__#30L, value#28]
+- FileScan text [value#28] Batched: false, DataFilters: [], Format: Text, Location: InMemoryFileIndex(1 paths)[dbfs:/album.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>




In [0]:
# ``to_pandas`` method collects the data from all workers to driver
# usually that will kill the driver by OOM (out-of-memory error)
# but if we take only the head, that's OK

pandas_df = raw_lines.head().to_pandas()
print(type(pandas_df))
pandas_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,value
0,"[{\n ""_id"": {\n ""$oid"": ""5714debb25ac0d8ae..."
1,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
2,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
3,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."
4,"""_id"": {\n ""$oid"": ""5714debb25ac0d8aee34d..."


In [0]:
def to_a_valid_json(
    pandas_df: pd.DataFrame["value": str]
) -> pd.DataFrame["value": str]:
    """
    Turn the fragments of a split JSON array into standalone JSON objects.

    Each input string is one record of a pretty-printed JSON array that was
    split on the ``\\n},{\\n`` separator, so every record lacks its outer
    braces, the first one still starts with ``[{`` and the last one still
    ends with ``}]``.

    The input dataframe is not modified; a new dataframe is returned.

    :param pandas_df: a ``pandas.DataFrame`` with only one string column
    :return: a dataframe of the same shape where all lines are valid JSON strings
    """
    values = pandas_df["value"]
    # Strip the array opening only at the very start of a record, so that a
    # nested ``[{`` followed by a newline inside a record is left intact.
    values = values.str.replace(r"\A\[\{\n", " ", regex=True)
    # Strip the array closing only at the very end of a record, so that ``}]``
    # inside string values or nested arrays is left intact.
    # NOTE(review): a non-final record whose last field is a nested array
    # ending in ``}]`` would still be truncated here - confirm the data has none.
    values = values.str.replace(r"\}\]\s*\Z", " ", regex=True)
    # JSON Lines requires one record per physical line; raw newlines only occur
    # between tokens (inside JSON strings they are escaped), so this is safe.
    values = values.str.replace("\n", " ", regex=False)
    # Restore the outer braces consumed by the record separator.
    return pandas_df.assign(value="{" + values + "}")

In [0]:
# let's test that our function works well on a real Pandas DataFrame
import json
import pandas

# the first and the last records are the special ones (they carry the array
# brackets), so we check both ends of the data
# ``DataFrame.append`` was removed in pandas 2.0, ``pandas.concat`` is its replacement
pandas_df = pandas.concat(
    [raw_lines.head().to_pandas(), raw_lines.tail().to_pandas()]
)
transformed_df = to_a_valid_json(pandas_df)
assert isinstance(transformed_df, pandas.DataFrame)
for line in transformed_df["value"]:
    # raises ``json.JSONDecodeError`` if the line is not a valid JSON
    json.loads(line)

In [0]:
# now we can apply our function in a batch mode to the Spark DataFrame
json_lines = raw_lines.pandas_on_spark.transform_batch(to_a_valid_json)
json_lines.head()

Unnamed: 0,value
0,"{ ""_id"": { ""$oid"": ""5714debb25ac0d8aee34..."
1,"{ ""_id"": { ""$oid"": ""5714debb25ac0d8aee34d..."
2,"{ ""_id"": { ""$oid"": ""5714debb25ac0d8aee34d..."
3,"{ ""_id"": { ""$oid"": ""5714debb25ac0d8aee34d..."
4,"{ ""_id"": { ""$oid"": ""5714debb25ac0d8aee34d..."


In [0]:
# now we write the Spark DataFrame to the disk as simple text files,
# one JSON record per line
json_lines.spark.frame().write.mode("overwrite").text("data.json_lines")

In [0]:
# notice that the output is a directory rather than file

dbutils.fs.ls("data.json_lines")

Out[24]: [FileInfo(path='dbfs:/data.json_lines/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1663089501000),
 FileInfo(path='dbfs:/data.json_lines/_committed_6346175295456222252', name='_committed_6346175295456222252', size=728, modificationTime=1663089501000),
 FileInfo(path='dbfs:/data.json_lines/_started_6346175295456222252', name='_started_6346175295456222252', size=0, modificationTime=1663089459000),
 FileInfo(path='dbfs:/data.json_lines/part-00000-tid-6346175295456222252-310276f2-3c2d-427d-be84-efb67d8d9c44-63-1-c000.txt', name='part-00000-tid-6346175295456222252-310276f2-3c2d-427d-be84-efb67d8d9c44-63-1-c000.txt', size=59626419, modificationTime=1663089494000),
 FileInfo(path='dbfs:/data.json_lines/part-00001-tid-6346175295456222252-310276f2-3c2d-427d-be84-efb67d8d9c44-64-1-c000.txt', name='part-00001-tid-6346175295456222252-310276f2-3c2d-427d-be84-efb67d8d9c44-64-1-c000.txt', size=59635859, modificationTime=1663089496000),
 FileInfo(path='dbfs:/data.json_lines/part-00002