RDD = Resilient Distributed Datasets

it supports in-memory processing computation => it stores the state of memory as an object across the jobs and the object is sharable between those jobs.

In [1]:
from pyspark.sql import SparkSession


spark=SparkSession.builder.appName('SPSQL').getOrCreate()
spark

In [3]:
spark.catalog.listTables()

[]

In [2]:
from pyspark import SparkFiles


url='https://raw.githubusercontent.com/justkacz/csvfiles/main/births.csv'

spark.sparkContext.addFile(url)

df=spark.read.csv(SparkFiles.get('births.csv'), header=True, inferSchema=True)
df.show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
|1969|    1|  3|     M|  4994|
|1969|    1|  4|     F|  4440|
|1969|    1|  4|     M|  4520|
|1969|    1|  5|     F|  4192|
|1969|    1|  5|     M|  4198|
|1969|    1|  6|     F|  4710|
|1969|    1|  6|     M|  4850|
|1969|    1|  7|     F|  4646|
|1969|    1|  7|     M|  5092|
|1969|    1|  8|     F|  4800|
|1969|    1|  8|     M|  4934|
|1969|    1|  9|     F|  4592|
|1969|    1|  9|     M|  4842|
|1969|    1| 10|     F|  4852|
|1969|    1| 10|     M|  5190|
+----+-----+---+------+------+
only showing top 20 rows



In [22]:
spark.catalog.listTables()

# or using sql:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [4]:
spark.catalog.listDatabases()

# or using sql:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [5]:
# current database:

spark.catalog.currentDatabase()

'default'

In [6]:
# creating a new database:
spark.sql('create database sparksql')

DataFrame[]

In [7]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
| sparksql|
+---------+



In [8]:
#creating temporary table in default database:

df.createOrReplaceTempView('dfsql')

In [9]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |    dfsql|       true|
+---------+---------+-----------+



In [10]:
query= "from dfsql select * limit 10"

df_10=spark.sql(query)
df_10.show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
|1969|    1|  3|     M|  4994|
|1969|    1|  4|     F|  4440|
|1969|    1|  4|     M|  4520|
|1969|    1|  5|     F|  4192|
|1969|    1|  5|     M|  4198|
+----+-----+---+------+------+



In [40]:
# converting spark DataFrame to a pandas DataFrame:
query=('from dfsql select year, gender, sum(births) as tot_births group by year, gender order by year')
dfs=spark.sql(query)
dfspd=dfs.toPandas()
dfspd  # pandas df cannot be used with sql methods -> its not included in the table/view catalog 
# (first must be converted = .createOrReplaceTempView())

Unnamed: 0,year,gender,tot_births
0,1969,F,1753634
1,1969,M,1846572
2,1970,M,1918636
3,1970,F,1819164
4,1971,F,1736774
...,...,...,...
75,2006,M,2188268
76,2007,F,2111890
77,2007,M,2212118
78,2008,M,2177227


In [11]:
import pandas as pd
import numpy as np

rng=np.random.RandomState(0)

# converting pandas DF to spark DF, spark DF is stored LOCALLY not in the SparkSession catalog -> data cannot be accessed in 
# other context

pddf=pd.DataFrame(rng.randn(12).reshape(4,3), columns=['a', 'b', 'c'])
pddf

Unnamed: 0,a,b,c
0,1.764052,0.400157,0.978738
1,2.240893,1.867558,-0.977278
2,0.950088,-0.151357,-0.103219
3,0.410599,0.144044,1.454274


In [12]:
sdf=spark.createDataFrame(pddf)
sdf.createOrReplaceTempView('sdf')

In [49]:
spark.catalog.listTables()

[Table(name='dfsql', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='sdf', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [16]:
df.select('gender').show()

+------+
|gender|
+------+
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
|     F|
|     M|
+------+
only showing top 20 rows



In [20]:
# pyspark DF -> filter
df.filter(df['gender']=='F').show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  2|     F|  4454|
|1969|    1|  3|     F|  4548|
|1969|    1|  4|     F|  4440|
|1969|    1|  5|     F|  4192|
|1969|    1|  6|     F|  4710|
|1969|    1|  7|     F|  4646|
|1969|    1|  8|     F|  4800|
|1969|    1|  9|     F|  4592|
|1969|    1| 10|     F|  4852|
|1969|    1| 11|     F|  4580|
|1969|    1| 12|     F|  4126|
|1969|    1| 13|     F|  4758|
|1969|    1| 14|     F|  5070|
|1969|    1| 15|     F|  4798|
|1969|    1| 16|     F|  4790|
|1969|    1| 17|     F|  4944|
|1969|    1| 18|     F|  4670|
|1969|    1| 19|     F|  4170|
|1969|    1| 20|     F|  4884|
+----+-----+---+------+------+
only showing top 20 rows



In [26]:
# or using sql query:
spark.sql('from dfsql select * where gender = "F" limit 5').show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  2|     F|  4454|
|1969|    1|  3|     F|  4548|
|1969|    1|  4|     F|  4440|
|1969|    1|  5|     F|  4192|
+----+-----+---+------+------+



In [68]:
url2='https://raw.githubusercontent.com/justkacz/csvfiles/main/Dane%20NW2.csv'

spark.sparkContext.addFile(url2)

df2=spark.read.option("delimiter", ";").csv(SparkFiles.get('Dane NW2.csv'), inferSchema=True, header=True)
df2.show()

+-------+-------+--------------+-----+
|OrderID|   from| Category Name|sales|
+-------+-------+--------------+-----+
|  10248| France|Dairy Products|  168|
|  10248| France|Grains/Cereals|   98|
|  10248| France|Dairy Products|  174|
|  10249| France|       Produce|167,4|
|  10249| France|       Produce| 1696|
|  10250|Belarus|       Seafood|   77|
|  10250|Belarus|       Produce| 1484|
|  10250|Belarus|    Condiments|  252|
|  10251| France|Grains/Cereals|100,8|
|  10251| France|Grains/Cereals|  234|
|  10251| France|    Condiments|  336|
|  10252| France|Dairy Products| 2592|
|  10252| France|Dairy Products|   50|
|  10252| France|Dairy Products| 1088|
|  10253|Belarus|Dairy Products|  200|
|  10253|Belarus|     Beverages|604,8|
|  10253|Belarus|   Confections|  640|
|  10254|Belgium|     Beverages|   54|
|  10254|Belgium|  Meat/Poultry|403,2|
|  10254|Belgium|       Produce|  168|
+-------+-------+--------------+-----+
only showing top 20 rows



In [52]:
#df2.filter(df2['Category Name'].contains('Prod')).show()
# or:
#df2.filter(df2['Category Name'].like('%Prod%')).show()
#or
from pyspark.sql.functions import col
df2.filter(col("Category Name").like('%Prod%')).show()

+-------+-------+--------------+-----+
|OrderID|   from| Category Name|sales|
+-------+-------+--------------+-----+
|  10248| France|Dairy Products|  168|
|  10248| France|Dairy Products|  174|
|  10249| France|       Produce|167,4|
|  10249| France|       Produce| 1696|
|  10250|Belarus|       Produce| 1484|
|  10252| France|Dairy Products| 2592|
|  10252| France|Dairy Products|   50|
|  10252| France|Dairy Products| 1088|
|  10253|Belarus|Dairy Products|  200|
|  10254|Belgium|       Produce|  168|
|  10255| France|Dairy Products|486,5|
|  10255| France|Dairy Products| 1320|
|  10258|Belgium|Dairy Products|153,6|
|  10261| France|Dairy Products|  160|
|  10262|  Spain|       Produce|  360|
|  10263|Belgium|       Produce|  288|
|  10266| France|Dairy Products|364,8|
|  10267|Austria|Dairy Products| 3080|
|  10268|Austria|Dairy Products|111,2|
|  10269| France|Dairy Products|  120|
+-------+-------+--------------+-----+
only showing top 20 rows



In [58]:
df2.groupby('Category Name').sum().show() #only OrderID culumn has numeric dtype, by default all numeric dtype columns are agg

+--------------+------------+
| Category Name|sum(OrderID)|
+--------------+------------+
|Dairy Products|     5236636|
|  Meat/Poultry|     1839680|
|    Condiments|     3851954|
|     Beverages|     2763695|
|Grains/Cereals|     2090139|
|       Seafood|     3523066|
|   Confections|     2215284|
|       Produce|     1450501|
+--------------+------------+



In [57]:
df2.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- sales: string (nullable = true)



In [106]:
# converting the data type of the sales column from string to float to make agregation:
from pyspark.sql.types import FloatType

df2=df2.withColumn('sales', df2.sales.cast(FloatType()))
df2.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- sales: float (nullable = true)



In [107]:
df2.groupby('Category Name').sum().show()

+--------------+------------+----------+
| Category Name|sum(OrderID)|sum(sales)|
+--------------+------------+----------+
|Dairy Products|     5236636|  267590.0|
|  Meat/Poultry|     1839680|  100210.0|
|    Condiments|     3851954|  155173.0|
|     Beverages|     2763695|  170051.0|
|Grains/Cereals|     2090139|   84042.0|
|       Seafood|     3523066|  104883.0|
|   Confections|     2215284|   86398.0|
|       Produce|     1450501|   87330.0|
+--------------+------------+----------+



In [108]:
df2.groupby('Category Name').sum('sales').show()

+--------------+----------+
| Category Name|sum(sales)|
+--------------+----------+
|Dairy Products|  267590.0|
|  Meat/Poultry|  100210.0|
|    Condiments|  155173.0|
|     Beverages|  170051.0|
|Grains/Cereals|   84042.0|
|       Seafood|  104883.0|
|   Confections|   86398.0|
|       Produce|   87330.0|
+--------------+----------+



In [148]:
# UDF with defined dtype:
from pyspark.sql.types import FloatType
from pyspark.sql import udf  
from pyspark.sql import functions as F


def sqr(s):
    return s*s

myUDF=F.udf(lambda x: sqr(x), FloatType())

In [149]:
df2.select('from', myUDF('sales')).show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Spark\spark-3.3.0-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 540, in main
RuntimeError: Python in worker has different version 3.9 than that in driver 3.8, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


In [123]:
df2.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- sales: float (nullable = true)

