# Chapter 9. Data Sources

In [208]:
from pyspark.sql import SparkSession
from py4j.java_gateway import java_import
java_import(spark._jvm, "org.sqlite.JDBC")

jdbc_jar_path = "/Users/khanhnn/Developer/DE/spark/practice_spark/jars/sqlite-jdbc-3.28.0.jar"

# Initialize the SparkSession with the correct JDBC jar
spark = SparkSession.builder\
    .appName("Data Sources")\
    .config("spark.jars", jdbc_jar_path)\
    .config("spark.driver.extraClassPath", jdbc_jar_path)\
    .config("spark.executor.extraClassPath", jdbc_jar_path)\
    .config("spark.sql.shuffle.partitions", "5")\
    .getOrCreate()

## CSV

### Read

In [209]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False),
])

In [210]:
spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .option("mode", "FAILFAST")\
    .schema(myManualSchema)\
    .load("../data/flight-data/csv/2010-summary.csv")\
    .show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [211]:
myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", LongType(), True),
  StructField("ORIGIN_COUNTRY_NAME", LongType(), True),
  StructField("countr", LongType(), False),
])

In [212]:
# spark.read.format("csv")\
#     .option("header", "true")\
#     .option("inferSchema", "true")\
#     .option("mode", "FAILFAST")\
#     .schema(myManualSchema)\
#     .load("../data/flight-data/csv/2010-summary.csv")\
#     .show(5)

> Spark will fail only at job execution time (This is due to lazy evaluation)

### Write

In [213]:
csvFile = spark.read.format("csv")\
  .option("header", "true")\
  .option("mode", "FAILFAST")\
  .option("inferSchema", "true")\
  .load("../data/flight-data/csv/2010-summary.csv")

In [214]:
csvFile.write.format("csv").mode("overwrite").option("sep", "\t")\
  .save("../tmp/my-tsv-file.tsv")

## JSON

### Read

In [215]:
spark.read.format("json").option("mode", "FAILFAST")\
  .option("inferSchema", "true")\
    .load("../data/flight-data/json/2010-summary.json").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



### Write

In [216]:
csvFile.write.format("json").mode("overwrite").save("../tmp/my-json-file.json")

## Parquet Files

### Read

In [217]:
spark.read.format("parquet")\
  .load("../data/flight-data/parquet/2010-summary.parquet").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



### Write

In [218]:
csvFile.write.format("parquet").mode("overwrite")\
  .save("../tmp/my-parquet-file.parquet")

## ORC Files

## Read

In [219]:
spark.read.format("orc").load("../data/flight-data/orc/2010-summary.orc").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



### Write

In [220]:
csvFile.write.format("orc").mode("overwrite").save("../tmp/my-json-file.orc")

## SQL Database

### Reading from SQL Database

In [221]:
driver = "org.sqlite.JDBC"
path = f"/Users/khanhnn/Developer/DE/spark/practice_spark/data/flight-data/jdbc/my-sqlite.db"
url = "jdbc:sqlite:/Users/khanhnn/Developer/DE/spark/practice_spark/data/flight-data/jdbc/my-sqlite.db"
tablename = "flight_info"


In [222]:
dbDataFrame = spark.read.format("jdbc").option("url", url)\
    .option("dbtable", tablename).option("driver",  driver).load()

Py4JJavaError: An error occurred while calling o932.load.
: java.lang.ClassNotFoundException: org.sqlite.JDBC
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:103)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:41)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:346)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at jdk.internal.reflect.GeneratedMethodAccessor66.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
pgDF = spark.read.format("jdbc")\
    .option("driver", "org.postgresql.Driver")\
    .option("url", "jdbc:postgresql://database_server")\
    .option("dbtable", "schema.tablename")\
    .option("user", "username").option("password", "my-secret-password").load()


Py4JJavaError: An error occurred while calling o851.load.
: java.lang.ClassNotFoundException: org.postgresql.Driver
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:594)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:103)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:103)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:41)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:346)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at jdk.internal.reflect.GeneratedMethodAccessor66.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [224]:
spark.read.text("../data/flight-data/csv/2010-summary.csv")\
  .selectExpr("split(value, ",") as rows").show(4)

ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near end of input.(line 1, pos 13)

== SQL ==
split(value, 
-------------^^^
