### Basic practice in using Lambda + map functions with PySpark

Package Import

In [None]:
### 
# You might have noticed this code in the screencast.
#
# import findspark
# findspark.init('spark-2.3.2-bin-hadoop2.7')
#
# The findspark Python module makes it easier to install
# Spark in local mode on your computer. This is convenient
# for practicing Spark syntax locally. 
# However, the workspaces already have Spark installed and you do not
# need to use the findspark module
#
###

from pyspark import SparkConf, SparkContext

Setting up Spark Context

In [None]:
# Describe the application: its name and a local master ("local[*]").
config = (
    SparkConf()
    .setAppName("maps_and_lazy_evaluation_example")
    .setMaster("local[*]")
)

# If this cell has been run before, a context is already bound to `sc`;
# stop it so a fresh one can be created below.
if "sc" in locals():
    sc.stop()

sc = SparkContext(conf=config)

# Shorter alternative that skips the explicit SparkConf:
# sc = SparkContext(appName="maps_and_lazy_evaluation_example")

Importing Data

In [None]:
# Small in-memory "play log". Some titles repeat, and "Despacito" also appears
# in lower case, so normalising the case later makes those entries identical.
log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "despacito",
    "All the stars",
]

Loading data and operations into Spark - to be processed in a parallel fashion

---

Question: What is Spark actually doing here? Does it split up the file into smaller chunks for parallel processing? What are the other options for loading data into Spark?

In [None]:
# Note: this cell appears to finish instantly, but no data is processed here.
# parallelize() and map() only LOAD instructions into Spark; the work is done
# later, once Spark knows the whole process and is asked for a result.
#
# The data is initialised in the same statement as all of the operations on
# purpose: re-running this cell rebuilds distributed_song_log from
# log_of_songs, so an operation cannot be stacked onto the DAG a second time
# by re-running a command.

distributed_song_log = (
    sc.parallelize(log_of_songs)  # hand the Python list to Spark
    .map(lambda x: x.lower())     # define (not yet run) lower-casing of each title
)

In [None]:
# collect() forces Spark to actually perform the operations specified so far
# and hands the outcome back to this notebook, where it is bound to `results`.
results = distributed_song_log.collect()
# Bare last expression so the notebook displays the collected values.
results

# Using SparkSession to create a Data Frame

Importing modules

In [None]:
from pyspark.sql import SparkSession

In [None]:
# getOrCreate() does not blindly build a second session: if a Spark session
# already exists it is reused (with the options below applied to it);
# otherwise a new one is created from this builder.
# The appName and config key/value used here are placeholders.
sparkSesh = (
    SparkSession.builder
    .appName("app Name")
    .config('config option', 'config value')
    .getOrCreate()
)

Look at parameters of the spark context

In [None]:
# Display all configuration settings of the SparkContext behind this session.
sparkSesh.sparkContext.getConf().getAll()

Use Spark to read JSON from file

In [6]:
# Path is relative, so it is resolved against the current working directory.
path = "./sparkify_log_small.json"
# Read the JSON log into a Spark DataFrame.
log_data = sparkSesh.read.json(path)

                                                                                

Print the schema of the loaded DataFrame: each column's name, data type, and nullability

In [7]:
# Prints one line per column: name, type and whether it is nullable.
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
# describe() builds a NEW DataFrame of summary statistics: a 'summary' column
# plus one string column per input column. That is closer to R's summary()
# than to str(); the structure of the data is what printSchema() reports.
# Only the DataFrame's repr is echoed by this cell -- chain .show() to print
# the statistics themselves.
log_data.describe()

                                                                                

DataFrame[summary: string, artist: string, auth: string, firstName: string, gender: string, itemInSession: string, lastName: string, length: string, level: string, location: string, method: string, page: string, registration: string, sessionId: string, song: string, status: string, ts: string, userAgent: string, userId: string]

In [10]:
# Retrieve the first 2 records; they come back as a list of Row objects.
log_data.take(2)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=1513720872284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046'),
 Row(artist='Lily Allen', auth='Logged In', firstName='Elizabeth', gender='F', itemInSession=7, lastName='Chase', length=195.23873, level='free', location='Shreveport-Bossier City, LA', method='PUT', page='NextSong', registration=1512718541284, sessionId=5027, song='Cheryl Tweedy', status=200, ts=1513720878284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='1000')]

Write data out

In [11]:
# Note how Spark saves the data: despite the ".csv" name, write_path is not a
# single CSV file but a FOLDER containing some metadata files and multiple
# partitioned files. Spark partitions the output automatically while keeping
# the abstraction at this level very simple.

write_path = "./sparkify_log_small.csv"

# mode="overwrite" keeps this cell re-runnable: without an explicit mode the
# save fails once write_path exists, i.e. on every run after the first one.
log_data.write.save(write_path, format="csv", header=True, mode="overwrite")

In [12]:
# Read back the CSV output we just created, taking column names from the
# header row. No schema is given, so every column is read as a string
# (compare the quoted numbers in the rows displayed further down).
written_log_file = sparkSesh.read.csv(write_path, header = True)

In [14]:
# First 2 rows of the re-read data; fields that were numeric in the JSON
# version (e.g. itemInSession, length, ts) now appear as strings.
written_log_file.take(2)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession='112', lastName='Matthews', length='232.93342', level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration='1509380319284', sessionId='5132', song='Christmas Tears Will Fall', status='200', ts='1513720872284', userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046'),
 Row(artist='Lily Allen', auth='Logged In', firstName='Elizabeth', gender='F', itemInSession='7', lastName='Chase', length='195.23873', level='free', location='Shreveport-Bossier City, LA', method='PUT', page='NextSong', registration='1512718541284', sessionId='5027', song='Cheryl Tweedy', status='200', ts='1513720878284', userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='1000')]

In [16]:
# Keep only the artist column and print it as a text table (top 20 rows).
written_log_file.select("artist").show()

+--------------------+
|              artist|
+--------------------+
|       Showaddywaddy|
|          Lily Allen|
|Cobra Starship Fe...|
|          Alex Smoke|
|                null|
|                null|
|              Redman|
|     Ulrich Schnauss|
|                null|
|                null|
|               Jay-Z|
|         Evanescence|
|     Scissor Sisters|
|        3 Doors Down|
|       George Younce|
|              Aly-Us|
|                null|
|            BjÃÂ¶rk|
|      David Bromberg|
|          Nickelback|
+--------------------+
only showing top 20 rows



In [25]:
# Distinct artist values, sorted by name; the null entry sorts first.
written_log_file.select("artist").dropDuplicates().sort('artist').show()

+--------------------+
|              artist|
+--------------------+
|                null|
|        3 Doors Down|
|                 311|
|       36 Crazyfists|
|               3OH!3|
|            3rd Bass|
|           3rd Force|
|             50 Cent|
|   A Day To Remember|
|      A Skylit Drive|
|A Tribe Called Quest|
|                A-Ha|
|                 AFI|
|       Aaron Neville|
|                Abba|
|         Ace of Base|
|       Action Action|
|          Adam Green|
|        Adam Lambert|
|           Aerosmith|
+--------------------+
only showing top 20 rows



In [33]:
# Number of log rows per artist, most frequent first.
# Rows without an artist hold a real null, not the text "null", so they are
# removed with isNotNull(). Comparing against the string "null" only dropped
# them as a side effect of null comparison semantics, and would also have
# dropped an artist whose name is literally "null".
(
    written_log_file
    .where(written_log_file.artist.isNotNull())
    .groupBy('artist')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+--------------------+-----+
|              artist|count|
+--------------------+-----+
|       Kings Of Leon|   17|
|            Coldplay|   16|
|Florence + The Ma...|   15|
|        Jack Johnson|   13|
|            BjÃÂ¶rk|   10|
|       Justin Bieber|   10|
|      The Black Keys|   10|
|          Lily Allen|    9|
|           Daft Punk|    9|
|            Tub Ring|    9|
|         OneRepublic|    8|
|           Radiohead|    7|
|     Alliance Ethnik|    7|
|        Taylor Swift|    6|
|          Kanye West|    6|
|             Rihanna|    6|
|         Miley Cyrus|    6|
|     Michael Jackson|    6|
|Red Hot Chili Pep...|    6|
|      Arctic Monkeys|    6|
+--------------------+-----+
only showing top 20 rows



In [35]:
# All log rows for user 1046, reduced to the user id, page and song columns.
# userId is a string column, hence the comparison against "1046".
(
    written_log_file
    .select(["userId", "page", "song"])
    .where(written_log_file.userId == "1046")
    .collect()
)

[Row(userId='1046', page='NextSong', song='Christmas Tears Will Fall'),
 Row(userId='1046', page='NextSong', song='Be Wary Of A Woman'),
 Row(userId='1046', page='NextSong', song='Public Enemy No.1'),
 Row(userId='1046', page='NextSong', song='Reign Of The Tyrants'),
 Row(userId='1046', page='NextSong', song='Father And Son'),
 Row(userId='1046', page='NextSong', song='No. 5'),
 Row(userId='1046', page='NextSong', song='Seventeen')]

Making a user-defined function with Spark

In [44]:
from datetime import datetime
from pyspark.sql.functions import udf


def _hour_from_epoch_ms(ts):
    """Return the hour of day for a timestamp given in epoch milliseconds.

    Args:
        ts: Timestamp in milliseconds since the epoch, as a number or a
            numeric string (the CSV-backed ``ts`` column holds strings).
            May be None.

    Returns:
        The hour (0-23) as an int, or None when ``ts`` is None so that a
        missing timestamp yields a null instead of failing the whole job.

    Note:
        ``datetime.fromtimestamp`` converts to the local timezone of the
        machine executing the function.
    """
    if ts is None:
        return None
    return datetime.fromtimestamp(float(ts) / 1000.0).hour


# No return type is passed to udf(), so the resulting column holds the hour
# as a string (e.g. '23'), exactly as before.
get_hour = udf(_hour_from_epoch_ms)

In [45]:
# Add an "hour" column computed from the ts column by the get_hour UDF.
# The name written_log_file is rebound to the extended DataFrame.
written_log_file = written_log_file.withColumn("hour", get_hour(written_log_file.ts))

In [47]:
# Inspect the first row to confirm the new hour column (a string, e.g. '23').
written_log_file.head()

Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession='112', lastName='Matthews', length='232.93342', level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration='1509380319284', sessionId='5132', song='Christmas Tears Will Fall', status='200', ts='1513720872284', userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046', hour='23')