RDD = Resilient Distributed Datasets

it supports in-memory processing computation => it stores the state of memory as an object across the jobs and the object is sharable between those jobs.

In [13]:
from pyspark.sql import SparkSession


spark=SparkSession.builder.appName('DCSQL').getOrCreate()
spark

In [12]:
spark.catalog.listTables()

[]

In [17]:
from pyspark import SparkFiles


url='https://raw.githubusercontent.com/justkacz/csvfiles/main/births.csv'

spark.sparkContext.addFile(url)

df=spark.read.csv(SparkFiles.get('births.csv'), header=True, inferSchema=True)
df.show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
|1969|    1|  3|     M|  4994|
|1969|    1|  4|     F|  4440|
|1969|    1|  4|     M|  4520|
|1969|    1|  5|     F|  4192|
|1969|    1|  5|     M|  4198|
|1969|    1|  6|     F|  4710|
|1969|    1|  6|     M|  4850|
|1969|    1|  7|     F|  4646|
|1969|    1|  7|     M|  5092|
|1969|    1|  8|     F|  4800|
|1969|    1|  8|     M|  4934|
|1969|    1|  9|     F|  4592|
|1969|    1|  9|     M|  4842|
|1969|    1| 10|     F|  4852|
|1969|    1| 10|     M|  5190|
+----+-----+---+------+------+
only showing top 20 rows



In [22]:
spark.catalog.listTables()

# or using sql:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [30]:
spark.catalog.listDatabases()

# or using sql:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [23]:
# current database:

spark.catalog.currentDatabase()

'default'

In [31]:
# creating new database:
spark.sql('create database sparksql')

DataFrame[]

In [33]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
| sparksql|
+---------+



In [35]:
#creating temporary table in default database:

df.createOrReplaceTempView('dfsql')

In [36]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |    dfsql|       true|
+---------+---------+-----------+



In [37]:
query= "from dfsql select * limit 10"

df_10=spark.sql(query)
df_10.show()

+----+-----+---+------+------+
|year|month|day|gender|births|
+----+-----+---+------+------+
|1969|    1|  1|     F|  4046|
|1969|    1|  1|     M|  4440|
|1969|    1|  2|     F|  4454|
|1969|    1|  2|     M|  4548|
|1969|    1|  3|     F|  4548|
|1969|    1|  3|     M|  4994|
|1969|    1|  4|     F|  4440|
|1969|    1|  4|     M|  4520|
|1969|    1|  5|     F|  4192|
|1969|    1|  5|     M|  4198|
+----+-----+---+------+------+



In [40]:
# converting aggregated dataset to a pandas DataFrame:
query=('from dfsql select year, gender, sum(births) as tot_births group by year, gender order by year')
dfs=spark.sql(query)
dfspd=dfs.toPandas()
dfspd

Unnamed: 0,year,gender,tot_births
0,1969,F,1753634
1,1969,M,1846572
2,1970,M,1918636
3,1970,F,1819164
4,1971,F,1736774
...,...,...,...
75,2006,M,2188268
76,2007,F,2111890
77,2007,M,2212118
78,2008,M,2177227
