# 1. Connect to Spark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the SparkSession used by every cell below, or reuse one that already
# exists. "local[4]" runs Spark in-process with four worker threads.
spark = (
    SparkSession.builder
    .appName("spark data source")
    .master("local[4]")
    .getOrCreate()
)

# Generic Load/Save **Functions**

In [4]:
# No format is passed, so the session's default data source is used
# (parquet, unless spark.sql.sources.default has been reconfigured).
users_parquet_path = "resources/users.parquet"
df = spark.read.load(users_parquet_path)

In [6]:
# Print only the name and favorite_color columns.
wanted_columns = ["name", "favorite_color"]
df.select(*wanted_columns).show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+



In [7]:
# Write the two selected columns out with the default data source.
# No save mode is given, so the writer's default mode applies.
names_and_colors = df.select("name", "favorite_color")
names_and_colors.write.save("namesAndFavColor.parquet")

## Manually Specifying Options

In [8]:
# Name the data source explicitly instead of relying on the default format.
df = spark.read.format("json").load("resources/people.json")

In [10]:
# Print every column of the JSON-backed DataFrame.
all_columns = df.select("*")
all_columns.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [11]:
# Convert the JSON data to parquet: keep name and age, and name the
# output format explicitly on the writer.
names_and_ages = df.select("name", "age")
names_and_ages.write.format("parquet").save("namesAndAges.parquet")

In [12]:
# CSV reader options: fields are separated by ";", the first line is a
# header row, and column types are inferred from the data.
csv_options = {"sep": ";", "inferSchema": "true", "header": "true"}
df = spark.read.load("resources/people.csv", format="csv", **csv_options)

In [15]:
# people.csv is a ";"-separated file with a header row, so it has to be read
# with the CSV data source. Reading it with format="json" does not yield the
# file's columns. The options match the earlier load of this same file.
df = spark.read.load(
    "resources/people.csv",
    format="csv",
    sep=";",
    inferSchema="true",
    header="true",
)

In [16]:
# Load the ORC sample file through the format-specific reader and print it.
users_orc_path = "resources/users.orc"
df = spark.read.orc(users_orc_path)
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [19]:
# Extra settings handed to the ORC data source when writing. The keys and
# values are passed through unchanged as writer options.
orc_write_options = {
    "orc.bloom.filter.columns": "favorite_color",
    "orc.dictionary.key.threshold": "1.0",
    "orc.column.encoding.direct": "name",
}
df.write.format("orc").options(**orc_write_options).save("users_with_options.orc")

In [20]:
# Read back the ORC output written with the custom options.
df = spark.read.load("users_with_options.orc", format="orc")

In [21]:
# Print the current contents of df to check what was read back.
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



## Run SQL on files directly

In [26]:
# Query the parquet file directly from SQL with the
# <format>.`<path>` syntax, without loading it into a DataFrame first.
file_query = "select * from parquet.`./resources/users.parquet`"
df = spark.sql(file_query)

In [27]:
# Print the rows returned by the SQL query over the parquet file.
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



## Save Modes

In [28]:
# "overwrite" mode replaces anything already stored at the target path,
# so this cell can be run more than once.
df.write.mode("overwrite").save("result/test")

In [29]:
# Read the data just written to result/test and print it as a check.
saved_result = spark.read.load("result/test")
saved_result.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [32]:
# Persist df as table "b", with the "path" option pointing the table's
# data files at ./path.
table_writer = df.write.option("path", "./path")
table_writer.saveAsTable("b")