# **PySpark Basic Operations**

In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate), using the local master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("My First pyspark app")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext (e.g. for sc.setLogLevel).
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/09 02:09:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## **Load Files**

### **Json**

In [2]:
# Read the JSON file with the "multiline" reader option switched on.
df_json = (
    spark.read
    .option("multiline", True)
    .json("data/03.json")
)

                                                                                

In [3]:
# Print the schema of the JSON DataFrame as a tree (nested struct/array fields included).
df_json.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip: long (nullable = true)
 |-- age: long (nullable = true)
 |-- children: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- married: boolean (nullable = true)
 |-- name: string (nullable = true)



### **CSV**

1. 所有欄位皆為文字 All Fields as Strings
1. 自動推斷欄位的類別 Automatic Inference of Field Types
1. 客製化欄位類別 Custom Field Types
1. 讀取特定目錄下的 CSV 檔 Read All CSV Files in a Specific Directory

In [4]:
# No schema and no inference requested: every column is read as a string.
df_csv_string = spark.read.options(header=True).csv("data/03.csv")

In [5]:
# Automatic inference of field types: let Spark infer each column's type from the data.
df_csv_auto = spark.read.options(header=True, inferSchema=True).csv("data/03.csv")

In [6]:
# Custom field types: declare the schema explicitly instead of inferring it.
# NOTE(review): the wildcard import is kept as-is because other cells may rely
# on names it exposes; prefer importing StructType, StructField, DoubleType,
# StringType and IntegerType explicitly once that is confirmed.
from pyspark.sql.types import *

# One nullable field per CSV column: A as double, B as string, C as integer.
custom_types = StructType([
    StructField('A', DoubleType(), True),
    StructField('B', StringType(), True),
    StructField('C', IntegerType(), True),
])

# Apply the declared schema while reading, then print it to confirm the types.
df_csv_custom = (
    spark.read
    .schema(custom_types)
    .options(header=True)
    .csv("data/03.csv")
)
df_csv_custom.printSchema()

root
 |-- A: double (nullable = true)
 |-- B: string (nullable = true)
 |-- C: integer (nullable = true)



In [7]:
# Read every CSV file found in a specific folder into a single DataFrame.
# NOTE(review): the schema printed below shows the column names " B" and " C"
# with a leading space, and the B values keep their double quotes. Presumably
# the files have a space after each comma; if so, the reader option
# ignoreLeadingWhiteSpace=True may be what is wanted. Confirm against the data.
df_csv_folder = spark.read.options(header=True).csv("data/03-many-csv")

# Inspect the column types first, then the rows.
df_csv_folder.printSchema()
df_csv_folder.show()

root
 |-- A: string (nullable = true)
 |--  B: string (nullable = true)
 |--  C: string (nullable = true)

+---+----+---+
|  A|   B|  C|
+---+----+---+
|1.0| "2"|  3|
|4.0| "5"|  6|
|7.0| "8"|  9|
|2.0| "2"|  2|
|2.0| "2"|  2|
|2.0| "2"|  2|
+---+----+---+



### **Pandas**

!!! note "Pandas API on Spark 的特色[^pandas-1]"
    1. Scalability beyond a **single** machine  
    1. Interactive data visualization  
    1. Leveraging unified analytics functionality in Spark  

    

[^pandas-1]:
    [Pandas API on Upcoming Apache Spark™ 3.2](https://www.databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html)

### **Parquet**

In [8]:
from pyspark import pandas as ps
import pandas as pd



In [9]:
# From a CSV file: load it through the pandas API on Spark (`ps` is `pyspark.pandas`),
# then show the resulting frame as the cell output.
df = ps.read_csv('data/03.csv')
df



Unnamed: 0,A,B,C
0,1.0,"""2""",3.0
1,4.0,"""5""",6.0
2,7.0,"""8""",9.0


In [10]:
# From a dict: each key becomes a column, each value a list of
# single-character strings.
df = ps.DataFrame(
    dict(
        A=list('123'),
        B=list('234'),
        C=list('567'),
    )
)

df

  fields = [
  for column, series in pdf.iteritems():
                                                                                

Unnamed: 0,A,B,C
0,1,2,5
1,2,3,6
2,3,4,7


In [11]:
# From a DataFrame: build a plain pandas frame first, then wrap it in a
# pandas-on-Spark DataFrame.
df = ps.DataFrame(
    pd.DataFrame(
        dict(
            A=list('123'),
            B=list('234'),
            C=list('567'),
        )
    )
)
df

  fields = [
  for column, series in pdf.iteritems():


Unnamed: 0,A,B,C
0,1,2,5
1,2,3,6
2,3,4,7


In [12]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Build a Spark DataFrame from explicit Row objects. No schema is passed, so
# the column types come from the Python values (int, float, str, date, datetime).
df = spark.createDataFrame(
    [
        Row(a=1, b=2.0, c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
        Row(a=2, b=3.0, c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
        Row(a=4, b=5.0, c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
    ]
)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]