# PySpark Quickstart: DataFrame

## Importing libraries

In [1]:
import pandas as pd

from datetime import datetime, date
from collections.abc import Iterator

from environment import print

from pyspark.sql import SparkSession, Row, Column, types
from pyspark.sql.functions import upper, pandas_udf, rand, col, expr

## Checking environment

In [2]:
# Check python version
import sys
print(sys.version)

3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]



In [3]:
!python -V

Python 3.11.9


In [4]:
# Check environment variables
import os
print(
    f"SPARK_HOME = {os.environ['SPARK_HOME']}",
    f"JAVA_HOME = {os.environ['JAVA_HOME']}",
    f"HADOOP_HOME = {os.environ['HADOOP_HOME']}",
    f"hadoop.home.dir = {os.environ['hadoop.home.dir']}",
    f"PYSPARK_PYTHON = {os.environ['PYSPARK_PYTHON']}",
    f"PYSPARK_DRIVER_PYTHON = {os.environ['PYSPARK_DRIVER_PYTHON']}",
)

SPARK_HOME = C:\Spark\spark-3.5.1-bin-hadoop3
JAVA_HOME = C:\Program Files\Java\latest\jre-1.8
HADOOP_HOME = C:\Spark\hadoop-3.3.6
hadoop.home.dir = C:\Spark\hadoop-3.3.6
PYSPARK_PYTHON = C:\ProgramData\anaconda3\envs\ml\python.exe
PYSPARK_DRIVER_PYTHON = C:\ProgramData\anaconda3\envs\ml\python.exe



## Spark session function

In [5]:
# Util functions
def create_spark_session():
    # spark = SparkSession.builder.master('local').appName('ml').getOrCreate()
    spark = SparkSession.builder.getOrCreate()

    # Setting the log level
    spark.sparkContext.setLogLevel("WARN")

    # Printing the environment
    print(f'Spark version: {spark} - {spark.version}')

    # Print the tables in the catalog
    print('Available tables:', spark.catalog.listTables())
    return spark

In [6]:
spark = create_spark_session()

Spark version: <pyspark.sql.session.SparkSession object at 0x00000207E82922D0> - 3.5.1

Available tables:
[]



In [7]:
# Reading data
print('\nReading table:', end='\n')
spark.read.json('data-sources/people.json').show()


Reading table:
+----+--------+
| age|    name|
+----+--------+
|NULL|Prashant|
|  30|   Abdul|
|  19|  Justin|
|  43|    Andy|
+----+--------+



## DataFrame Creation

In [8]:
# Create a PySpark DataFrame from a list of rows
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),  # noqa
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),  # noqa
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))   # noqa
])
print('Creating a simple spark dataframe:', df)

Creating a simple spark dataframe:
DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]



In [9]:
# Create a PySpark DataFrame from a list of rows
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),  # noqa
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),  # noqa
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))   # noqa
])
print('Creating a simple spark dataframe:', df)

Creating a simple spark dataframe:
DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]



In [10]:
# Create a PySpark DataFrame with an explicit schema.
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
print('Creating a spark dataframe with an schema:', df)

Creating a spark dataframe with an schema:
DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]



In [11]:
# Create a PySpark DataFrame from a pandas DataFrame
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]  # noqa
})
data_schema = types.StructType([
    types.StructField('a', types.IntegerType(), True),
    types.StructField('b', types.FloatType(), True),
    types.StructField('c', types.StringType(), True),
    types.StructField('d', types.DateType(), True),
    types.StructField('e', types.TimestampType(), True),
])
df1 = spark.createDataFrame(pandas_df)
print('Creating a spark dataframe from pandas (Infering schema):', df1)
df1.printSchema()

Creating a spark dataframe from pandas (Infering schema):
DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [12]:
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [13]:
df2 = spark.createDataFrame(pandas_df, schema=data_schema)
print('Creating a spark dataframe from pandas (Explicit schema):', df)
print('Object type', type(df2))
df2.printSchema()

Creating a spark dataframe from pandas (Explicit schema):
DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

Object type
<class 'pyspark.sql.dataframe.DataFrame'>

root
 |-- a: integer (nullable = true)
 |-- b: float (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [14]:
df2.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



## Viewing Data

In [15]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [16]:
df2

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [17]:
df2.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
only showing top 1 row



In [18]:
df2.columns

['a', 'b', 'c', 'd', 'e']

In [19]:
df2.printSchema()

root
 |-- a: integer (nullable = true)
 |-- b: float (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [20]:
df2.select("a", "b", "c").describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   NULL|
| stddev|1.0|1.0|   NULL|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [21]:
df2.describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   NULL|
| stddev|1.0|1.0|   NULL|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [22]:
df2.describe()

summary,a,b,c
count,3.0,3.0,3
mean,2.0,3.0,
stddev,1.0,1.0,
min,1.0,2.0,string1
max,3.0,4.0,string3


In [23]:
# DataFrame.collect() collects the distributed data to the driver side as the local data in Python. 
# Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver 
# side because it collects all the data from executors to the driver side.
df2.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [24]:
# In order to avoid throwing an out-of-memory exception, use DataFrame.take() or DataFrame.tail().
df2.take(2)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0))]

In [25]:
df2.tail(2)

[Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [26]:
# PySpark DataFrame also provides the conversion back to a pandas DataFrame to leverage pandas API. 
# Note that toPandas also collects all data into the driver side that can easily cause an 
# out-of-memory-error when the data is too large to fit into the driver side.
df2.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


## Selecting and Accessing Data

In [27]:
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [28]:
df.a

Column<'a'>

In [29]:
print(type(df.c), type(upper(df.c)), type(df.c.isNull()))
type(df.c) == type(upper(df.c)) == type(df.c.isNull())

<class 'pyspark.sql.column.Column'>
<class 'pyspark.sql.column.Column'>
<class 'pyspark.sql.column.Column'>



True

In [30]:
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [31]:
# Assign new Column instance.
df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [32]:
# To select a subset of rows, use DataFrame.filter().
df.filter(df.a == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



## Applying a Function

In [33]:
@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    # Simply plus one by using pandas Series.
    return series + 1

df.select(pandas_plus_one(df.a)).show()

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+



In [34]:
df.select(upper(df.c)).show()

+--------+
|upper(c)|
+--------+
| STRING1|
| STRING2|
| STRING3|
+--------+



In [35]:
# DataFrame.mapInPandas allows users directly use the APIs in a pandas DataFrame 
# without any restrictions such as the result length.
def pandas_filter_func(iterator):
    for pandas_df in iterator:
        yield pandas_df[pandas_df.a == 1]

df.mapInPandas(pandas_filter_func, schema=df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



### Another example

![PandasUDF_applyInPandas_and_mapInPandas](images/PandasUDF_applyInPandas_and_mapInPandas.png)

In [36]:
def generate_initial_df(num_rows, num_devices, num_trips):
   return (
       spark.range(num_rows)
       .withColumn('device_id', (rand()*num_devices).cast('int'))
       .withColumn('trip_id', (rand()*num_trips).cast('int'))
       .withColumn('sensor_reading', (rand()*1000))
       # .drop('id')
   )

df_origin = generate_initial_df(5000000, 10000, 50)
print((df_origin.count(), len(df_origin.columns)))
df_origin

(5000000, 4)



id,device_id,trip_id,sensor_reading
0,7085,15,565.3351721177589
1,5614,17,480.9863031229702
2,9443,0,410.92721186155615
3,6285,41,654.0700823683391
4,3706,29,959.4971241264644
5,5069,19,765.6619907260629
6,4725,12,119.47958576777484
7,6267,1,578.3716312782248
8,5814,10,966.2220829633112
9,198,18,164.21666364139497


In [37]:
# pandas_udf
@pandas_udf('double')
def calculate_sqrt(sensor_reading: pd.Series) -> pd.Series:
   return sensor_reading.apply(lambda x: x**0.5)

df_pandas_udf = df_origin.withColumn('sqrt_reading', calculate_sqrt(col('sensor_reading')))
df_pandas_udf

id,device_id,trip_id,sensor_reading,sqrt_reading
0,7085,15,565.3351721177589,23.77677800118761
1,5614,17,480.9863031229702,21.931399935320368
2,9443,0,410.92721186155615,20.271339666177862
3,6285,41,654.0700823683391,25.57479388711352
4,3706,29,959.4971241264644,30.97575058213222
5,5069,19,765.6619907260629,27.670597946666472
6,4725,12,119.47958576777484,10.930671789408684
7,6267,1,578.3716312782248,24.04935823006977
8,5814,10,966.2220829633112,31.084113031632597
9,198,18,164.21666364139497,12.8147049767599


In [38]:
# applyInPandas
def denormalize(pdf: pd.DataFrame) -> pd.DataFrame:
  aggregated_df = pdf.groupby('device_id', as_index=False).agg({
      'id': 'max',
      'trip_id': lambda x: list(x), 
      'sensor_reading': 'mean', 
      'sqrt_reading': 'mean'
  })
  return aggregated_df

expected_schema = 'device_id int, id int, trip_id array<int>, sensor_reading long, sqrt_reading long'
df_applyinpandas = df_pandas_udf.groupBy('device_id').applyInPandas(denormalize, schema=expected_schema)
df_applyinpandas

device_id,id,trip_id,sensor_reading,sqrt_reading
31,4999466,"[21, 23, 1, 24, 4...",503,21
34,4997892,"[12, 21, 9, 19, 4...",492,20
53,4996226,"[32, 23, 23, 36, ...",497,21
65,4997875,"[24, 38, 48, 36, ...",495,21
78,4998781,"[33, 21, 30, 4, 1...",505,21
85,4990461,"[10, 41, 29, 24, ...",504,21
108,4992992,"[1, 35, 21, 45, 4...",498,21
133,4989746,"[7, 14, 19, 34, 2...",452,19
137,4999313,"[28, 27, 29, 35, ...",518,21
148,4975991,"[10, 6, 24, 1, 16...",506,21


In [39]:
# Simple Group list for pandas:
dfx = pd.DataFrame( {
    'a':['A','A','B','B','B','C'], 
    'b':[1,2,5,5,4,6],
    'c':[3,3,3,4,4,4]})
dfx.groupby('a').agg(lambda x: list(x))

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
A,"[1, 2]","[3, 3]"
B,"[5, 5, 4]","[3, 4, 4]"
C,[6],[4]


In [40]:
# mapInPandas
def renormalize(itr: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
   for pdf in itr:
       # Unpack the list of values from the trip_id column into their own rows
       pdf = pdf.explode('trip_id')
       yield pdf

expected_schema = 'device_id int, id int, trip_id int, sensor_reading long, sqrt_reading long'
df_mapinpandas = df_applyinpandas.mapInPandas(renormalize, schema=expected_schema)
df_mapinpandas

device_id,id,trip_id,sensor_reading,sqrt_reading
31,4999466,21,503,21
31,4999466,23,503,21
31,4999466,1,503,21
31,4999466,24,503,21
31,4999466,43,503,21
31,4999466,23,503,21
31,4999466,17,503,21
31,4999466,37,503,21
31,4999466,29,503,21
31,4999466,9,503,21


## Grouping Data

In [41]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.printSchema()
df

root
 |-- color: string (nullable = true)
 |-- fruit: string (nullable = true)
 |-- v1: long (nullable = true)
 |-- v2: long (nullable = true)



color,fruit,v1,v2
red,banana,1,10
blue,banana,2,20
red,carrot,3,30
blue,grape,4,40
red,carrot,5,50
black,carrot,6,60
red,banana,7,70
red,grape,8,80


In [42]:
df1 = df.groupby('color').avg()
df1.printSchema()
df1.show()

root
 |-- color: string (nullable = true)
 |-- avg(v1): double (nullable = true)
 |-- avg(v2): double (nullable = true)

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [43]:
def plus_mean(pandas_df):
    return pandas_df.assign(v1=pandas_df.v1 - pandas_df.v1.mean())

df1 = df.groupby('color').applyInPandas(plus_mean, schema=df.schema)
df1.printSchema()
df1.show()

root
 |-- color: string (nullable = true)
 |-- fruit: string (nullable = true)
 |-- v1: long (nullable = true)
 |-- v2: long (nullable = true)

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



### Co-grouping and applying a function.

In [44]:
df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1'))

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2'))

In [45]:
df1

time,id,v1
20000101,1,1.0
20000101,2,2.0
20000102,1,3.0
20000102,2,4.0


In [46]:
df2

time,id,v2
20000101,1,x
20000101,2,y


In [47]:
def merge_ordered(l, r):
    return pd.merge_ordered(l, r)

(df1.groupby('id')
    .cogroup(df2.groupby('id'))
    .applyInPandas(merge_ordered, 
                   schema='time int, id int, v1 double, v2 string')
).show()

+--------+---+---+----+
|    time| id| v1|  v2|
+--------+---+---+----+
|20000101|  1|1.0|   x|
|20000102|  1|3.0|NULL|
|20000101|  2|2.0|   y|
|20000102|  2|4.0|NULL|
+--------+---+---+----+



## Getting Data In/Out

In [48]:
df

color,fruit,v1,v2
red,banana,1,10
blue,banana,2,20
red,carrot,3,30
blue,grape,4,40
red,carrot,5,50
black,carrot,6,60
red,banana,7,70
red,grape,8,80


### CSV

In [49]:
df.write.csv('files/foo.csv', header=True, mode="overwrite")
spark.read.csv('files/foo.csv', header=True).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



### Parquet

In [50]:
df.write.parquet('files/bar.parquet', mode="overwrite")
spark.read.parquet('files/bar.parquet').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  6| 60|
| blue|banana|  2| 20|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|  red|banana|  7| 70|
|  red|banana|  1| 10|
|  red|carrot|  3| 30|
|  red| grape|  8| 80|
+-----+------+---+---+



### ORC

In [51]:
df.write.orc('files/zoo.orc', mode="overwrite")
spark.read.orc('files/zoo.orc').show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  7| 70|
|  red| grape|  8| 80|
|black|carrot|  6| 60|
| blue|banana|  2| 20|
|  red|banana|  1| 10|
|  red|carrot|  5| 50|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
+-----+------+---+---+



## Working with SQL

In [52]:
df.createOrReplaceTempView("tableA")
spark.sql("SELECT * from tableA").show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [53]:
spark.sql("SELECT count(*) from tableA").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



In [54]:
# UDFs can be registered and invoked in SQL out of the box
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    return s + 1

spark.udf.register("add_one", add_one)
spark.sql("SELECT add_one(v1) FROM tableA").show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+



In [55]:
# These SQL expressions can directly be mixed and used as PySpark columns.
df.selectExpr('add_one(v1)').show()
df.select(expr('count(*)') > 0).show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+

+--------------+
|(count(1) > 0)|
+--------------+
|          true|
+--------------+



## Close session

In [56]:
spark.stop()