RDD = Resilient Distributed Datasets

it supports in-memory processing computation => it stores the state of memory as an object across the jobs and the object is sharable between those jobs.

In [1]:
from pyspark.sql import SparkSession


spark=SparkSession.builder.appName('SPSQL').getOrCreate()
spark

In [2]:
spark.catalog.listTables()

[]

In [3]:
from pyspark import SparkFiles


url='https://raw.githubusercontent.com/justkacz/csvfiles/main/births.csv'

spark.sparkContext.addFile(url)

df=spark.read.csv(SparkFiles.get('births.csv'), header=True, inferSchema=True)
df.show(5)

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
+----+-----+---+------+------+
only showing top 5 rows



In [4]:
spark.catalog.listTables()

# or using sql:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [5]:
spark.catalog.listDatabases()

# or using sql:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [6]:
# current database:

spark.catalog.currentDatabase()

'default'

In [7]:
# creating a new database:
spark.sql('create database sparksql')

DataFrame[]

In [8]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
| sparksql|
+---------+



In [9]:
#creating temporary table in a default database:

df.createOrReplaceTempView('dfsql')

In [10]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |    dfsql|       true|
+---------+---------+-----------+



In [11]:
query= "from dfsql select * limit 10"

df_10=spark.sql(query)
df_10.show(5)

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
+----+-----+---+------+------+
only showing top 5 rows



In [12]:
# converting spark DataFrame to a pandas DataFrame:
query=('from dfsql select year, gender, sum(births) as tot_births group by year, gender order by year')
dfs=spark.sql(query)
dfspd=dfs.toPandas()
dfspd  # pandas df cannot be used with sql methods -> its not included in the table/view catalog 
# (first must be converted = .createOrReplaceTempView())

Unnamed: 0,year,gender,tot_births
0,1969,F,1753634
1,1969,M,1846572
2,1970,M,1918636
3,1970,F,1819164
4,1971,F,1736774
...,...,...,...
75,2006,M,2188268
76,2007,F,2111890
77,2007,M,2212118
78,2008,M,2177227


In [13]:
import pandas as pd
import numpy as np

rng=np.random.RandomState(0)

# converting pandas DF to spark DF, spark DF is stored LOCALLY not in the SparkSession catalog -> data cannot be accessed in 
# other context

#pandas DF:
pddf=pd.DataFrame(rng.randn(12).reshape(4,3), columns=['a', 'b', 'c'])
pddf

Unnamed: 0,a,b,c
0,1.764052,0.400157,0.978738
1,2.240893,1.867558,-0.977278
2,0.950088,-0.151357,-0.103219
3,0.410599,0.144044,1.454274


In [14]:
#converting pandas DF to spark DF
sdf=spark.createDataFrame(pddf)

#adding spark DF to the catalog
sdf.createOrReplaceTempView('sdf')

In [15]:
spark.catalog.listTables()

[Table(name='dfsql', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='sdf', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [16]:
df.select('gender').show(5)

+------+
|gender|
+------+
|     F|
|     M|
|     F|
|     M|
|     F|
+------+
only showing top 5 rows



In [17]:
# pyspark DF -> filter
df.filter(df['gender']=='F').show(5)

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  2|     F|  4454|
|1969|    1|  3|     F|  4548|
|1969|    1|  4|     F|  4440|
|1969|    1|  5|     F|  4192|
+----+-----+---+------+------+
only showing top 5 rows



In [18]:
# or using sql query:
spark.sql('from dfsql select * where gender = "F" limit 5').show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  2|     F|  4454|
|1969|    1|  3|     F|  4548|
|1969|    1|  4|     F|  4440|
|1969|    1|  5|     F|  4192|
+----+-----+---+------+------+



In [19]:
url2='https://raw.githubusercontent.com/justkacz/csvfiles/main/Dane%20NW2.csv'

spark.sparkContext.addFile(url2)

df2=spark.read.option("delimiter", ";").csv(SparkFiles.get('Dane NW2.csv'), inferSchema=True, header=True)
df2.show(5)

+-------+------+--------------+-----+
|OrderID|  from| Category Name|sales|
+-------+------+--------------+-----+
|  10248|France|Dairy Products|  168|
|  10248|France|Grains/Cereals|   98|
|  10248|France|Dairy Products|  174|
|  10249|France|       Produce|167,4|
|  10249|France|       Produce| 1696|
+-------+------+--------------+-----+
only showing top 5 rows



In [20]:
#df2.filter(df2['Category Name'].contains('Prod')).show()
# or:
#df2.filter(df2['Category Name'].like('%Prod%')).show()
#or
from pyspark.sql.functions import col
df2.filter(col("Category Name").like('%Prod%')).show(5)

+-------+-------+--------------+-----+
|OrderID|   from| Category Name|sales|
+-------+-------+--------------+-----+
|  10248| France|Dairy Products|  168|
|  10248| France|Dairy Products|  174|
|  10249| France|       Produce|167,4|
|  10249| France|       Produce| 1696|
|  10250|Belarus|       Produce| 1484|
+-------+-------+--------------+-----+
only showing top 5 rows



In [21]:
# or filtering only column names with specific pattern by using regex function:

df2.select(df2.colRegex("`(Categ)+?.+`")).show(5)

+--------------+
| Category Name|
+--------------+
|Dairy Products|
|Grains/Cereals|
|Dairy Products|
|       Produce|
|       Produce|
+--------------+
only showing top 5 rows



In [22]:
#df2.select(df2.colRegex("`^.*ateg*`")).show()
df2.select(df2.colRegex("`(Categ)+?.+`")).show(3)

+--------------+
| Category Name|
+--------------+
|Dairy Products|
|Grains/Cereals|
|Dairy Products|
+--------------+
only showing top 3 rows



In [23]:
#selecting particular columns (second and third) by index:

df2.select(df2.columns[1:3]).show(5)

+------+--------------+
|  from| Category Name|
+------+--------------+
|France|Dairy Products|
|France|Grains/Cereals|
|France|Dairy Products|
|France|       Produce|
|France|       Produce|
+------+--------------+
only showing top 5 rows



In [24]:
df2.groupby('Category Name').sum().show() #only OrderID culumn has numeric dtype, by default all numeric dtype columns are agg

+--------------+------------+
| Category Name|sum(OrderID)|
+--------------+------------+
|Dairy Products|     5236636|
|  Meat/Poultry|     1839680|
|    Condiments|     3851954|
|     Beverages|     2763695|
|Grains/Cereals|     2090139|
|       Seafood|     3523066|
|   Confections|     2215284|
|       Produce|     1450501|
+--------------+------------+



In [25]:
df2.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- sales: string (nullable = true)



In [26]:
# converting the data type of the sales column from string to float to make agregation:
from pyspark.sql.types import FloatType

df2=df2.withColumn('sales', df2.sales.cast(FloatType()))
df2.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- sales: float (nullable = true)



In [27]:
df2.groupby('Category Name').sum().show(5)

+--------------+------------+----------+
| Category Name|sum(OrderID)|sum(sales)|
+--------------+------------+----------+
|Dairy Products|     5236636|  267590.0|
|  Meat/Poultry|     1839680|  100210.0|
|    Condiments|     3851954|  155173.0|
|     Beverages|     2763695|  170051.0|
|Grains/Cereals|     2090139|   84042.0|
+--------------+------------+----------+
only showing top 5 rows



In [28]:
df2.groupby('Category Name').sum('sales').show(5)

+--------------+----------+
| Category Name|sum(sales)|
+--------------+----------+
|Dairy Products|  267590.0|
|  Meat/Poultry|  100210.0|
|    Condiments|  155173.0|
|     Beverages|  170051.0|
|Grains/Cereals|   84042.0|
+--------------+----------+
only showing top 5 rows



UDF -> PYTHON IN WORKER AND PYTHON IN DRIVER MUST HAVE THE SAME VERSION (in case of PySpark 3.. Python version = 3.8 or lower)

we have to install the same version of Python as this one in the Spark driver -> then environmnent variables should be added:

PYSPARK_PYTHON -> with exact path to the python.exe file with appropriate version (corresponding to driver)

PYSPARK_DRIVER_PYTHON -> the same as pyspark_python

Linux -> bashrc file modification

In [40]:
# UDF with defined dtype:
from pyspark.sql.types import FloatType
from pyspark.sql import udf  
from pyspark.sql.functions import udf
import numpy as np


def sqr(s):
    if s is not None:
        return float(s)*float(s)
    else:
        return 0

myUDF=udf(lambda x: sqr(x), FloatType())

# function might be used without registration but will not be visible in spark.catalog.listFunctions()
spark.udf.register('myUDF', myUDF) 

<function __main__.<lambda>(x)>

In [41]:
df2.select('from', myUDF('sales')).show(5)

+------+---------------+
|  from|<lambda>(sales)|
+------+---------------+
|France|        28224.0|
|France|         9604.0|
|France|        30276.0|
|France|           null|
|France|      2876416.0|
+------+---------------+
only showing top 5 rows



In [77]:
# checking registered UDFs:
# spark.catalog.listFunctions()

def udf_checker(udfname):
    for fn in spark.catalog.listFunctions():
        if fn.name == udfname:
            print('Function: ', udfname, ' is already registerd.')
            break
    else:
        print('Function: ', udfname, ' is not registerd yet.')


def udf_checker2(udfname):
    if udfname in [fn.name for fn in spark.catalog.listFunctions()]:
        print('Function: ', udfname, ' is already registerd.')
    else:
        print('Function: ', udfname, ' is not registerd yet.')
        
            
            
udf_checker2('myUDF')

Function:  myUDF2  is not registerd yet.


In [75]:
def cube(c):
    if c is not None:
        return float(c)*float(c)*float(c)
    else:
        return 0

# function has not been registered:    
myUDF2=udf(lambda x: cube(x), FloatType())

In [76]:
df2.select('from', myUDF2('sales')).show(5)

+------+---------------+
|  from|<lambda>(sales)|
+------+---------------+
|France|      4741632.0|
|France|       941192.0|
|France|      5268024.0|
|France|           null|
|France|    4.8784015E9|
+------+---------------+
only showing top 5 rows



In [78]:
# and is not visible in the spark catalog:
udf_checker('myUDF2')

Function:  myUDF2  is not registerd yet.


In [None]:
# removing NULL values or adding if else into UDF to avoid None val - error when calling UDF on column with null values:
# df2 =df2.filter('sales is not NULL')
# df2.filter('sales is NULL').show()

In [None]:
spark.sql('from dfsql select * limit 5').show()