In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | done
[?25h  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=eb1d657c4740bb1774f4e6ee3d82957c5d72f75aa530c1f6f685a65753c7df7c
  Stored in directory: /root/.cache/pip/wheels/5a/54/9b/a89cac960efb57c4c35d41cc7c9f7b80daa21108bc376339b7
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    F

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests, json

from pyspark.sql import SparkSession

- We will start by going through the basics of Spark Sessions 

- Create simple Dataframes

- Understand how Spark Metastore works 

- Then bring in the data from the model API end point https://pyrite-ethereal-soccer.glitch.me/docs

- Then learn to do Transformation and loading into files and other locations

In [3]:
spark = SparkSession. \
    builder. \
    appName('SparkOnKaggle'). \
    getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/08 16:10:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark

In [5]:
#Creating spark objects from regular python objects

attendants = [(1, "Smith", "Deo", 1086.0, "Linea"),
             (2, "Kenry", "Jord", 1950.0, "Iran"),
             (3, "Kick", "Boxer", 7550.0, "Kingome"),
             (4, "Gill", "Nome", 1500.0, "New Wold"),
              (5, "Nill", "While", 6500.0, "New Wold"),
              (6, "Will", "Lobe", 18900.0, "Oreleans")
            ]

In [6]:
attendantSDF = spark.createDataFrame(attendants, schema="""attendant_id INT, f_name STRING,
                                     l_name STRING, salary FLOAT, Nationality STRING""")
attendantSDF.show()

                                                                                

+------------+------+------+-------+-----------+
|attendant_id|f_name|l_name| salary|Nationality|
+------------+------+------+-------+-----------+
|           1| Smith|   Deo| 1086.0|      Linea|
|           2| Kenry|  Jord| 1950.0|       Iran|
|           3|  Kick| Boxer| 7550.0|    Kingome|
|           4|  Gill|  Nome| 1500.0|   New Wold|
|           5|  Nill| While| 6500.0|   New Wold|
|           6|  Will|  Lobe|18900.0|   Oreleans|
+------------+------+------+-------+-----------+



In [7]:
attendantSDF.collect()

[Row(attendant_id=1, f_name='Smith', l_name='Deo', salary=1086.0, Nationality='Linea'),
 Row(attendant_id=2, f_name='Kenry', l_name='Jord', salary=1950.0, Nationality='Iran'),
 Row(attendant_id=3, f_name='Kick', l_name='Boxer', salary=7550.0, Nationality='Kingome'),
 Row(attendant_id=4, f_name='Gill', l_name='Nome', salary=1500.0, Nationality='New Wold'),
 Row(attendant_id=5, f_name='Nill', l_name='While', salary=6500.0, Nationality='New Wold'),
 Row(attendant_id=6, f_name='Will', l_name='Lobe', salary=18900.0, Nationality='Oreleans')]

In [8]:
from pyspark.sql.functions import *

In [9]:
#Three different ways to initiate transformation on the dataframes
#select
attendantSDF.select("f_name","l_name",concat('l_name','f_name').alias('full_name')).show()

+------+------+---------+
|f_name|l_name|full_name|
+------+------+---------+
| Smith|   Deo| DeoSmith|
| Kenry|  Jord|JordKenry|
|  Kick| Boxer|BoxerKick|
|  Gill|  Nome| NomeGill|
|  Nill| While|WhileNill|
|  Will|  Lobe| LobeWill|
+------+------+---------+



In [10]:
#Using withColumn method
attendantSDF.withColumn('new_name',concat('Nationality',lit(' ** '),'f_name')).show()

+------------+------+------+-------+-----------+----------------+
|attendant_id|f_name|l_name| salary|Nationality|        new_name|
+------------+------+------+-------+-----------+----------------+
|           1| Smith|   Deo| 1086.0|      Linea|  Linea ** Smith|
|           2| Kenry|  Jord| 1950.0|       Iran|   Iran ** Kenry|
|           3|  Kick| Boxer| 7550.0|    Kingome| Kingome ** Kick|
|           4|  Gill|  Nome| 1500.0|   New Wold|New Wold ** Gill|
|           5|  Nill| While| 6500.0|   New Wold|New Wold ** Nill|
|           6|  Will|  Lobe|18900.0|   Oreleans|Oreleans ** Will|
+------------+------+------+-------+-----------+----------------+



In [11]:
#Using withColumn method
attendantSDF.selectExpr("salary * 2 AS new_salary", "CONCAT(f_name,l_name) AS new_name").show()

+----------+---------+
|new_salary| new_name|
+----------+---------+
|    2172.0| SmithDeo|
|    3900.0|KenryJord|
|   15100.0|KickBoxer|
|    3000.0| GillNome|
|   13000.0|NillWhile|
|   37800.0| WillLobe|
+----------+---------+



In [12]:
attendantSDF.drop('Nationality').show()

+------------+------+------+-------+
|attendant_id|f_name|l_name| salary|
+------------+------+------+-------+
|           1| Smith|   Deo| 1086.0|
|           2| Kenry|  Jord| 1950.0|
|           3|  Kick| Boxer| 7550.0|
|           4|  Gill|  Nome| 1500.0|
|           5|  Nill| While| 6500.0|
|           6|  Will|  Lobe|18900.0|
+------------+------+------+-------+



In [13]:
type(attendantSDF.drop('Nationality').collect()[0])

pyspark.sql.types.Row

In [14]:
attendantSDF.schema

StructType([StructField('attendant_id', IntegerType(), True), StructField('f_name', StringType(), True), StructField('l_name', StringType(), True), StructField('salary', FloatType(), True), StructField('Nationality', StringType(), True)])

In [15]:
attendantSDF.printSchema()

root
 |-- attendant_id: integer (nullable = true)
 |-- f_name: string (nullable = true)
 |-- l_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- Nationality: string (nullable = true)



In [16]:
attendantSDF.write.parquet(path='/kaggle/working/parquet',
                          mode='overwrite',
                          compression='snappy')

                                                                                

In [17]:
!ls /kaggle/working/parquet/

_SUCCESS
part-00000-cab34eaa-ca9b-4c64-a5cf-f8f39a25f885-c000.snappy.parquet
part-00001-cab34eaa-ca9b-4c64-a5cf-f8f39a25f885-c000.snappy.parquet
part-00002-cab34eaa-ca9b-4c64-a5cf-f8f39a25f885-c000.snappy.parquet
part-00003-cab34eaa-ca9b-4c64-a5cf-f8f39a25f885-c000.snappy.parquet


In [18]:
new_parquet_read = spark.read.parquet('/kaggle/working/parquet')
new_parquet_read.show()

+------------+------+------+-------+-----------+
|attendant_id|f_name|l_name| salary|Nationality|
+------------+------+------+-------+-----------+
|           4|  Gill|  Nome| 1500.0|   New Wold|
|           5|  Nill| While| 6500.0|   New Wold|
|           6|  Will|  Lobe|18900.0|   Oreleans|
|           2| Kenry|  Jord| 1950.0|       Iran|
|           3|  Kick| Boxer| 7550.0|    Kingome|
|           1| Smith|   Deo| 1086.0|      Linea|
+------------+------+------+-------+-----------+



In [19]:
attendantSDF.coalesce(1).write.mode('append').option("compression","snappy"). \
    option('sep','|').format('csv').save('/kaggle/working/csv/')

In [20]:
!ls /kaggle/working/csv/

_SUCCESS  part-00000-aa5b6bbc-4730-4bd2-990f-5f6aa79fd6e3-c000.csv.snappy


In [21]:
attendantSDF.coalesce(1).write.mode('overwrite').option("compression","gzip"). \
    option('sep','|').format('csv').save('/kaggle/working/csv_gzip/')

In [22]:
!ls /kaggle/working/csv_gzip/

_SUCCESS  part-00000-1328472d-71cb-423b-be0b-ed6067d5dcb6-c000.csv.gz


In [23]:
spark.conf.set("spark.sql.shuffle.partitions",'2')

In [24]:
spark.sql("DROP DATABASE IF EXISTS trial_database")
spark.sql("CREATE DATABASE trial_database")
spark.sql("USE DATABASE trial_database")

DataFrame[]

In [25]:
spark.catalog.currentDatabase()

'trial_database'

In [26]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/kaggle/working/spark-warehouse'),
 Database(name='trial_database', description='', locationUri='file:/kaggle/working/spark-warehouse/trial_database.db')]

In [27]:
attendantSDF.write.saveAsTable("attendants", mode='overwrite',partitionBy='Nationality')

In [28]:
spark.catalog.listTables()

[Table(name='attendants', database='trial_database', description=None, tableType='MANAGED', isTemporary=False)]

In [29]:
spark.sql('DESCRIBE FORMATTED attendants').show(truncate=False)

+----------------------------+-----------------------------------------------------------------+-------+
|col_name                    |data_type                                                        |comment|
+----------------------------+-----------------------------------------------------------------+-------+
|attendant_id                |int                                                              |null   |
|f_name                      |string                                                           |null   |
|l_name                      |string                                                           |null   |
|salary                      |float                                                            |null   |
|Nationality                 |string                                                           |null   |
|# Partition Information     |                                                                 |       |
|# col_name                  |data_type                

In [30]:
spark.catalog.createTable("attendants_new_csv",
                                  path='/kaggle/working/csv/', 
                                 source='csv',
                                 sep='|',
                                 header='true',
                                 inferSchema='true')

DataFrame[1: int, Smith: string, Deo: string, 1086.0: double, Linea: string]

In [31]:
spark.sql('DESCRIBE FORMATTED attendants_new_csv').show(truncate=False)

+----------------------------+--------------------------------------+-------+
|col_name                    |data_type                             |comment|
+----------------------------+--------------------------------------+-------+
|1                           |int                                   |null   |
|Smith                       |string                                |null   |
|Deo                         |string                                |null   |
|1086.0                      |double                                |null   |
|Linea                       |string                                |null   |
|                            |                                      |       |
|# Detailed Table Information|                                      |       |
|Database                    |trial_database                        |       |
|Table                       |attendants_new_csv                    |       |
|Created Time                |Wed Mar 08 16:10:35 UTC 2023      

In [32]:
spark.sql(f"DROP DATABASE IF EXISTS trial_database CASCADE")

DataFrame[]

In [33]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/kaggle/working/spark-warehouse')]

### Starting the ETL process from an API

https://pyrite-ethereal-soccer.glitch.me/docs

https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json

In [34]:
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json'
res = requests.request(method='GET',url=url)
data_json = res.json()

In [35]:
main_data = data_json['rows']

In [36]:
spark.stop()

In [37]:
etl_spark = SparkSession. \
    builder. \
    appName('ETL_spark'). \
    getOrCreate()

In [38]:
etl_spark

In [39]:
api_datadf = etl_spark.createDataFrame(data=main_data)

In [40]:
api_datadf.schema

StructType([StructField('download_count', LongType(), True), StructField('project', StringType(), True)])

In [41]:
api_datadf.printSchema()

root
 |-- download_count: long (nullable = true)
 |-- project: string (nullable = true)



In [42]:
# A bit more involved api.. using the pyrite api
pyriteurl = "https://pyrite-ethereal-soccer.glitch.me/biker/v1/deliveryd"
pyriteRes = requests.request(method='GET',url=pyriteurl)
pyriteData = pyriteRes.json()
pyriteData

[{'ID': '0x2318',
  'Delivery_person_ID': 'COIMBRES13DEL01',
  'Type_of_Vehicle': 'electric_scooter',
  'DeliveryPersonAge': 29.5,
  'DeliveryPersonRatings': 4.6,
  'TypeOfVehicle': 2.0,
  'DeliveryPersonID': 2.0},
 {'ID': '0x3474',
  'Delivery_person_ID': 'BANGRES15DEL01',
  'Type_of_Vehicle': 'motorcycle',
  'DeliveryPersonAge': 28.0,
  'DeliveryPersonRatings': 4.6,
  'TypeOfVehicle': 0.0,
  'DeliveryPersonID': 266.0}]

In [43]:
pyriteSDF = etl_spark.createDataFrame(data = pyriteData)
pyriteSDF.show()

                                                                                

+-----------------+----------------+---------------------+------------------+------+-------------+----------------+
|DeliveryPersonAge|DeliveryPersonID|DeliveryPersonRatings|Delivery_person_ID|    ID|TypeOfVehicle| Type_of_Vehicle|
+-----------------+----------------+---------------------+------------------+------+-------------+----------------+
|             29.5|             2.0|                  4.6|   COIMBRES13DEL01|0x2318|          2.0|electric_scooter|
|             28.0|           266.0|                  4.6|    BANGRES15DEL01|0x3474|          0.0|      motorcycle|
+-----------------+----------------+---------------------+------------------+------+-------------+----------------+



In [44]:
pyriteSDF.printSchema()

root
 |-- DeliveryPersonAge: double (nullable = true)
 |-- DeliveryPersonID: double (nullable = true)
 |-- DeliveryPersonRatings: double (nullable = true)
 |-- Delivery_person_ID: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- TypeOfVehicle: double (nullable = true)
 |-- Type_of_Vehicle: string (nullable = true)



In [45]:
#Writing the data to spark catalog
etl_spark.sql("CREATE DATABASE IF NOT EXISTS api_database")
etl_spark.sql("USE DATABASE api_database")
etl_spark.catalog.listDatabases()

[Database(name='api_database', description='', locationUri='file:/kaggle/working/spark-warehouse/api_database.db'),
 Database(name='default', description='default database', locationUri='file:/kaggle/working/spark-warehouse')]

In [46]:
pyriteSDF.write.saveAsTable("pyrite_table")
etl_spark.sql("SELECT * FROM pyrite_table").show(truncate=False)

+-----------------+----------------+---------------------+------------------+------+-------------+----------------+
|DeliveryPersonAge|DeliveryPersonID|DeliveryPersonRatings|Delivery_person_ID|ID    |TypeOfVehicle|Type_of_Vehicle |
+-----------------+----------------+---------------------+------------------+------+-------------+----------------+
|29.5             |2.0             |4.6                  |COIMBRES13DEL01   |0x2318|2.0          |electric_scooter|
|28.0             |266.0           |4.6                  |BANGRES15DEL01    |0x3474|0.0          |motorcycle      |
+-----------------+----------------+---------------------+------------------+------+-------------+----------------+



In [47]:
etl_spark.sql("SHOW TABLES").show()

+------------+------------+-----------+
|   namespace|   tableName|isTemporary|
+------------+------------+-----------+
|api_database|pyrite_table|      false|
+------------+------------+-----------+



In [48]:
etl_spark.sql("SHOW VIEWS").show()

+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
+---------+--------+-----------+



In [49]:
etl_spark.sql("SHOW DATABASES").show()

+------------+
|   namespace|
+------------+
|api_database|
|     default|
+------------+



In [50]:
etl_spark.catalog.listTables()

[Table(name='pyrite_table', database='api_database', description=None, tableType='MANAGED', isTemporary=False)]

In [51]:
pyriteSDF.coalesce(1).write.csv(path='/kaggle/working/pyrite_csv',
                               sep='|',header=True,
                               mode='overwrite')

In [52]:
!ls /kaggle/working/pyrite_csv/

_SUCCESS  part-00000-19599321-ebe4-4223-b145-2594d3677d78-c000.csv


In [53]:
!cat /kaggle/working/pyrite_csv/part-00000-fbd2b9b0-c6f1-4d69-80c9-7a0497eab97b-c000.csv

cat: /kaggle/working/pyrite_csv/part-00000-fbd2b9b0-c6f1-4d69-80c9-7a0497eab97b-c000.csv: No such file or directory
