In [1]:
# Use the findspark library to locate the local Spark installation so that
# pyspark can be imported from this kernel.
import os

import findspark

# Prefer a non-empty SPARK_HOME environment variable so the notebook is not
# tied to one machine; fall back to the original local install path otherwise.
SPARK_HOME = os.environ.get("SPARK_HOME") or r'C:\spark\spark-3.5.0-bin-hadoop3'
findspark.init(SPARK_HOME)

# Only import pyspark after findspark.init() has run.
import pyspark
from pyspark.sql import SparkSession

# Create the session, or reuse one that is already running in this kernel.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [2]:
# Department reference data, one (dept_name, dept_id) tuple per row.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptColumns = ["dept_name", "dept_id"]

# Only column names are supplied; the column types are inferred from the tuples.
deptDF = spark.createDataFrame(dept, deptColumns)

# Show the inferred schema, then the full (untruncated) contents.
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



deptDF: The PySpark DataFrame created in the previous cell; it holds the department names and IDs.

.collect(): The collect method is called on the DataFrame. This action retrieves all the rows from the DataFrame and returns them as a local Python list.

So, after executing the line `dataCollect = deptDF.collect()` in the next cell, the variable dataCollect will contain all the data from the deptDF DataFrame as a Python list. Each element of the list represents one row of the DataFrame and is stored as a Row object, whose fields can be read by column name (for example, row['dept_name']). Keep in mind that collect() can be memory-intensive and should be used with caution, especially for large DataFrames: it brings all the data to the driver node, which may cause memory issues.

In [3]:
# Retrieve every row of deptDF to the driver as a local Python list of Row objects.
# collect() loads the whole DataFrame into driver memory, so use it only on small data.
dataCollect = deptDF.collect()
# Print the raw list to show the Row(...) representation of each element.
print(dataCollect)

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


In [4]:
# Project down to the dept_name column first, so only that column is collected.
deptNameDF = deptDF.select("dept_name")
dataCollect2 = deptNameDF.collect()
print(dataCollect2)

[Row(dept_name='Finance'), Row(dept_name='Marketing'), Row(dept_name='Sales'), Row(dept_name='IT')]


In [5]:
# Walk the collected rows on the driver and print each one as "dept_name,dept_id".
for row in dataCollect:
    # dept_id is numeric, so convert it to text before joining.
    fields = (row['dept_name'], str(row['dept_id']))
    print(",".join(fields))

Finance,10
Marketing,20
Sales,30
IT,40
