In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/20 04:27:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Creating PySpark DataFrame

- RDD, list, DataFrame
- TXT, CSV, JSON, ORC, Avro, Parquet, XML formats by reading from HDFS, S3, DBFS, Azure Blob file systems
- Kafka, MongoDB, MySQL, etc. — by reading data from RDBMS and NoSQL databases.

In [8]:
# Column labels and sample rows of (language, user count) used to build the first RDD.
columns = [
    "language",
    "users_count",
]
data = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]

In [10]:
# Turn the local `data` list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

## Create DataFrame from RDD

In [18]:
# Option 1: RDD.toDF()
# Create a DataFrame from the existing RDD, naming its columns with `columns`.
dfFromRDD1 = rdd.toDF(columns)

# Option 2: SparkSession.createDataFrame()
# Alternative call taking the same RDD and column names (left disabled here).
# dfFromRDD2 = spark.createDataFrame(rdd, columns)

In [23]:
# Print the column names/types as a tree, then render the rows as a text table.
dfFromRDD1.printSchema()
dfFromRDD1.show()

root
 |-- language: string (nullable = true)
 |-- users_count: long (nullable = true)

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



### ✿ PySpark StructType & StructField

- By default, the data type of each column is inferred from the data.
- We can change this behavior by supplying a schema, where we can specify a column name, data type, and nullable flag for each field/column.
- A schema also lets us create complex columns such as nested struct, array, and map columns.
- Reference: https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/

- StructField: defines a column's name, its data type, a boolean specifying whether the field is nullable, and metadata.
- StructType: a collection of StructFields

In [21]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [70]:
# Sample rows: (firstname, middlename, lastname, id, gender, salary).
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Explicit schema: each StructField gives (column name, data type, nullable).
# All text columns are strings; salary is declared as IntegerType.
schema = StructType(
    [
        StructField("firstname", StringType(), True),
        StructField("middlename", StringType(), True),
        StructField("lastname", StringType(), True),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ]
)

df = spark.createDataFrame(data=data, schema=schema)

df.printSchema()
# truncate=False: print full cell contents instead of truncating long strings.
df.show(truncate=False)
# Last expression of the cell: the schema serialized as a JSON string.
df.schema.json()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



'{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"}],"type":"struct"}'

## Create DataFrame from List Collection

In [31]:
# createDataFrame() with column names only: the column types are inferred from `data`.
columns = [
    'firstname',
    'middlename',
    'lastname',
    'id',
    'gender',
    'salary',
]
dfFromData2 = spark.createDataFrame(data=data, schema=columns)

dfFromData2.printSchema()
dfFromData2.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [32]:
# createDataFrame() with an explicit StructType: salary is declared as IntegerType.
schema = StructType(
    [
        StructField("firstname", StringType(), True),
        StructField("middlename", StringType(), True),
        StructField("lastname", StringType(), True),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ]
)

dfFromData3 = spark.createDataFrame(data=data, schema=schema)

dfFromData3.printSchema()
# truncate=False: print full cell contents instead of truncating long strings.
dfFromData3.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



## Create DataFrame from Data sources

- In real-world applications, you mostly create DataFrames from data source files such as CSV, text, JSON, XML, etc.

In [45]:
# Schema applied to the CSV file: name and gender as strings, age as an integer.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("age", IntegerType(), True),
    ]
)

# header=True: the first line of the file holds column names, not data.
df_csv = spark.read.csv(
    './test.csv',
    schema=schema,
    header=True,
)

df_csv.printSchema()
df_csv.show()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)

+-----+------+---+
| name|gender|age|
+-----+------+---+
| Mike|  Male| 24|
|Sujan|Female| 27|
| Deli|  Male| 32|
| Kahl|  Male| 17|
| Rosa|Female| 51|
+-----+------+---+



## Create an Empty DataFrame

In [50]:
# Empty DataFrame with no rows and no columns (an empty StructType).
df_empty_no_schema = spark.createDataFrame(
    data=[],
    schema=StructType([]),
)

In [51]:
# Empty DataFrame that still carries columns. `schema` is whatever StructType
# was assigned last — the name/gender/age schema when cells are run in order.
df_empty_with_schema = spark.createDataFrame(
    data=[],
    schema=schema,
)

In [54]:
# Inspect the schema-less empty DataFrame: its schema tree, then its (empty) table.
df_empty_no_schema.printSchema()
df_empty_no_schema.show()

root

++
||
++
++



In [55]:
# Inspect the empty DataFrame built with a schema: its schema tree, then its table.
df_empty_with_schema.printSchema()
df_empty_with_schema.show()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)

+----+------+---+
|name|gender|age|
+----+------+---+
+----+------+---+



## Convert PySpark RDD to DataFrame

In [56]:
# RDD of (department name, department id) pairs.
rdd = spark.sparkContext.parallelize(
    [
        ("Finance", 10),
        ("Marketing", 20),
        ("Sales", 30),
        ("IT", 40),
    ]
)

In [59]:
# Approach 1: RDD.toDF() with a list of column names.
columns = [
    "dept_name",
    "dept_id",
]

df = rdd.toDF(schema=columns)

df.printSchema()
df.show()

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [63]:
# Approach 2: SparkSession.createDataFrame() on the RDD, reusing the
# `columns` name list from the preceding cell.
df = spark.createDataFrame(data=rdd, schema=columns)

df.printSchema()
df.show()

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [64]:
# Approach 3: createDataFrame() with an explicit StructType, declaring
# dept_id as IntegerType.
schema = StructType(
    [
        StructField("dept_name", StringType(), True),
        StructField("dept_id", IntegerType(), True),
    ]
)

df = spark.createDataFrame(data=rdd, schema=schema)

df.printSchema()
df.show()

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: integer (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



### ✿ Convert PySpark DataFrame to Pandas

- On large datasets, PySpark operations can run faster than pandas because of Spark's distributed nature and parallel execution across multiple cores and machines.
- If you are working on a machine learning application that deals with large datasets, PySpark can process operations many times faster than pandas.

In [66]:
# Build the Spark DataFrame to be converted to pandas. Only column names are
# given, so the column types are inferred from `data`.
columns = [
    'first_name',
    'middle_name',
    'last_name',
    'id',
    'gender',
    'salary',
]

pysparkDF = spark.createDataFrame(data=data, schema=columns)
pysparkDF.printSchema()
# truncate=False: print full cell contents instead of truncating long strings.
pysparkDF.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|id   |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |3000  |
|Michael   |Rose       |         |40288|M     |4000  |
|Robert    |           |Williams |42114|M     |4000  |
|Maria     |Anne       |Jones    |39192|F     |4000  |
|Jen       |Mary       |Brown    |     |F     |-1    |
+----------+-----------+---------+-----+------+------+



In [69]:
# PySparkDF.toPandas()
# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE(review): toPandas() is documented to load all rows into driver memory — keep inputs small.

pandasDF = pysparkDF.toPandas()
print(type(pandasDF))  # show the type of the converted object
print(pandasDF)

<class 'pandas.core.frame.DataFrame'>
  first_name middle_name last_name     id gender  salary
0      James                 Smith  36636      M    3000
1    Michael        Rose            40288      M    4000
2     Robert              Williams  42114      M    4000
3      Maria        Anne     Jones  39192      F    4000
4        Jen        Mary     Brown             F      -1
