- Author: Ben Du
- Date: 2020-06-17
- Title: Read Multiple Files into a DataFrame in Spark
- Slug: spark-dataframe-read-multiple-files
- Category: Computer Science
- Tags: programming, Scala, Spark, DataFrame, read, multiple, file
- Modified: 2020-06-17


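The examples below pass a single path string to `spark.read.parquet` and show which path/glob forms work and which fail. For the glob syntax (`*`, `[a-b]`, `{a,b}`), see the Hadoop `FileSystem.globStatus` documentation:
https://hadoop.apache.org/docs/r2.7.2/api/org/apache/hadoop/fs/FileSystem.html#globStatus(org.apache.hadoop.fs.Path)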

In [1]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.3.1
org.apache.spark spark-sql_2.11 2.3.1

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Create (or reuse, via getOrCreate) the SparkSession used by every cell below.
// master("local[2]") selects a local master rather than a cluster.
// NOTE(review): the app name and the "spark.some.config.option" setting look like
// placeholders carried over from another example; presumably neither affects the
// file-reading behaviour demonstrated in this notebook - confirm before relying on them.
val spark = SparkSession.builder()
    .master("local[2]")
    .appName("Spark Column Example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()

import spark.implicits._

org.apache.spark.sql.SparkSession$implicits$@107f8243

In [21]:
// Baseline: read one Parquet file, addressed by a fully qualified file:// URI.
// The recorded output below shows the resulting schema (item_id, site_id).
spark.read.parquet(
    "file:///workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet"
)

[item_id: bigint, site_id: bigint]

In [11]:
// Works: a character-range glob. `[1-2]` matches a single character in the range,
// so this pattern selects abc_item_1.parquet and abc_item_2.parquet.
// The path is given without a scheme (no file:// prefix).
spark.read.parquet(
    "/workdir/archives/projects/rilb/src/test/resources/abc_item_[1-2].parquet"
)

[item_id: bigint, site_id: bigint]

In [10]:
// Works: a wildcard glob. `*` matches any run of characters within the file name,
// so every abc_item_<something>.parquet file in the directory is selected.
spark.read.parquet(
    "/workdir/archives/projects/rilb/src/test/resources/abc_item_*.parquet"
)

[item_id: bigint, site_id: bigint]

In [6]:
// Works: brace alternation on part of the file name. `{1,2}` expands to the
// alternatives "1" and "2", i.e. abc_item_1.parquet and abc_item_2.parquet.
spark.read.parquet(
    "/workdir/archives/projects/rilb/src/test/resources/abc_item_{1,2}.parquet"
)

[item_id: bigint, site_id: bigint]

In [13]:
// Works: brace alternation over complete file names inside a common directory.
spark.read.parquet(
    "/workdir/archives/projects/rilb/src/test/resources/{abc_item_1.parquet,abc_item_2.parquet}"
)

[item_id: bigint, site_id: bigint]

In [14]:
// Works: the alternatives inside the braces may themselves contain "/" and span
// several path components; only the /workdir prefix is shared here.
spark.read.parquet(
    "/workdir/{archives/projects/rilb/src/test/resources/abc_item_1.parquet,archives/projects/rilb/src/test/resources/abc_item_2.parquet}"
)

[item_id: bigint, site_id: bigint]

In [15]:
// Works: the braces can start directly after the root "/", so two otherwise
// unrelated absolute paths can be combined into one pattern string.
spark.read.parquet(
    "/{workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet,workdir/archives/projects/rilb/src/test/resources/abc_item_2.parquet}"
)

[item_id: bigint, site_id: bigint]

In [5]:
// Fails (AnalysisException: Path does not exist, see recorded output below):
// two absolute paths joined by a comma WITHOUT surrounding braces.
// NOTE(review): presumably the whole string is treated as one path rather than being
// split on the comma. To read several files, use a {a,b} glob as in the cells above,
// or pass each path as a separate argument: spark.read.parquet(path1, path2).
spark.read.parquet(
    "/workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet,/workdir/archives/projects/rilb/src/test/resources/abc_item_2.parquet"
)

org.apache.spark.sql.AnalysisException:  Path does not exist

In [24]:
// Fails (AnalysisException: Path does not exist, see recorded output below):
// same comma-joined form as the previous cell, but with file:// URIs.
// Adding the scheme does not make a comma-separated string work.
spark.read.parquet(
    "file:///workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet,file:///workdir/archives/projects/rilb/src/test/resources/abc_item_2.parquet"
)

org.apache.spark.sql.AnalysisException:  Path does not exist

In [20]:
// Fails (IllegalArgumentException: Wrong FS, see recorded output below):
// the file:// scheme is placed outside the braces and each alternative is an
// absolute path starting with "/".
// NOTE(review): presumably the text right after "file://" is parsed as the URI
// authority, so the pattern no longer refers to the local file system - confirm.
spark.read.parquet(
    "file://{/workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet,/workdir/archives/projects/rilb/src/test/resources/abc_item_2.parquet}"
)

java.lang.IllegalArgumentException:  Wrong FS

In [9]:
// Fails (IllegalArgumentException wrapping java.net.URISyntaxException, see recorded
// output below): the braces wrap complete file:// URIs, scheme included.
// Combined with the working cells above, this suggests that brace alternation must
// be applied to the path part only, not to the scheme.
spark.read.parquet(
    "{file:///workdir/archives/projects/rilb/src/test/resources/abc_item_1.parquet,file:///workdir/archives/projects/rilb/src/test/resources/abc_item_2.parquet}"
)

java.lang.IllegalArgumentException:  java.net.URISyntaxException