Read a CSV

In [0]:
# Load the 2010 flight-summary CSV. The first row supplies the column names,
# column types are inferred from the data, and FAILFAST mode is requested so
# that malformed input stops the read instead of being silently tolerated.
csvFile = (
    spark.read.format("csv")
    .options(header="true", mode="FAILFAST", inferSchema="true")
    .load("/FileStore/tables/2010_summary.csv")
)


In [0]:
# Show the first 20 rows of the CSV DataFrame in the notebook's table view.
# Tip: pressing Tab activates Databricks server-side autocomplete.
first_rows = csvFile.head(20)
display(first_rows)

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,1
United States,Ireland,264
United States,India,69
Egypt,United States,24
Equatorial Guinea,United States,1
United States,Singapore,25
United States,Grenada,54
Costa Rica,United States,477
Senegal,United States,29
United States,Marshall Islands,44


Write CSV as a TSV:

In [0]:
# Write the DataFrame back out through the CSV writer, using a tab as the
# field separator and replacing whatever already exists at the target path.
(
    csvFile.write.format("csv")
    .mode("overwrite")
    .option("sep", "\t")
    .save("/FileStore/tables/my-tsv-file.tsv")
)


In [0]:
# Python version: read the JSON flight summary with FAILFAST mode and print
# the first five rows.
json_reader = (
    spark.read.format("json")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
)
json_reader.load("/FileStore/tables/2010_summary.json").show(5)

CANNOT WRITE TO THIS INSTANCE!?

In [0]:
# Save the CSV-sourced DataFrame as JSON, replacing any existing output.
(
    csvFile.write.format("json")
    .mode("overwrite")
    .save("/my-json-file.json")
)


In [0]:
# Read an uploaded Parquet part-file and print its first five rows.
parquet_reader = spark.read.format("parquet")
parquet_reader.load(
    "/FileStore/shared_uploads/jo20146230@wipro.com/part_r_00000_1a9822ba_b8fb_4d8e_844a_ea30d0801b9e_gz.parquet"
).show(5)



In [0]:
# Persist the CSV-sourced DataFrame as Parquet, replacing any earlier output.
(
    csvFile.write.format("parquet")
    .mode("overwrite")
    .save("/tmp/my-parquet-file.parquet")
)


In [0]:
# List the contents of the DBFS root.
# The cell previously started with a bare `%fs` magic (no command), which
# turns the whole cell into a filesystem-magic cell so the Python call below
# is not executed as Python. Use either `%fs ls /` or the dbutils call alone;
# the Python form is kept here so the result is available as a value.
dbutils.fs.ls("/")

In [0]:
# Read an uploaded ORC part-file and print its first five rows.
orc_reader = spark.read.format("orc")
orc_reader.load(
    "/FileStore/shared_uploads/jo20146230@wipro.com/part_r_00000_2c4f7d96_e703_4de3_af1b_1441d172c80f_snappy.orc"
).show(5)


In [0]:
# Save the CSV-sourced DataFrame in ORC format, replacing existing output.
# Note: the target path says "json" in its name, but the data written is ORC.
(
    csvFile.write.format("orc")
    .mode("overwrite")
    .save("/FileStore/tmp/my-json-file.orc")
)


In [0]:
# Connection settings for the SQLite flight database used by the JDBC cells.
# Provenance: my-sqlite.db from the databricks/Spark-The-Definitive-Guide
# repository (data/flight-data/jdbc), uploaded to FileStore/tables.
driver = "org.sqlite.JDBC"
# SQLite opens the file through the local filesystem, hence the /dbfs prefix.
path = "/dbfs/FileStore/tables/my_sqlite.db"
# The SQLite JDBC URL is "jdbc:sqlite:" + <file path>. The previous
# "jdbc:sqlite::" had a doubled colon, which made the driver look for a file
# whose name starts with ":" instead of the intended database.
url = "jdbc:sqlite:" + path
tablename = "flight_info"


In [0]:

# Confirm the uploaded SQLite file exists.
# dbutils.fs takes DBFS paths, whereas "/dbfs/..." is the driver-local mount of
# DBFS; passing the "/dbfs" form here would look up dbfs:/dbfs/FileStore/...,
# which is not where the file lives. Address it with an explicit dbfs: path.
dbutils.fs.ls("dbfs:/FileStore/tables/my_sqlite.db")

In [0]:
%sh
# Launch template for a PySpark shell with a JDBC driver jar supplied via the
# executor classpath, the driver classpath and --jars. <jdbc.jar> and
# <master-URL> are unfilled placeholders.
# NOTE(review): left unquoted, the shell will likely parse `<...>` as
# redirections, so substitute real values before running this cell.
pyspark --conf spark.executor.extraClassPath=<jdbc.jar> --driver-class-path <jdbc.jar> --jars <jdbc.jar> --master <master-URL>


In [0]:

# Build a SQLContext on top of the existing SparkContext `sc`; the JDBC read
# that follows uses `sqlContext.read` as its entry point.
# NOTE(review): SQLContext is reportedly deprecated in recent PySpark releases
# in favour of SparkSession (`spark`) — confirm against the cluster's version.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

**CAN'T CREATE AN ACCESS TOKEN FOR THE JDBC DRIVER ON THE PRACTICE ACCOUNT IN ORDER TO CREATE THE SQLCONTEXT; WILL HAVE TO BUILD DATA FROM SCRATCH TO PRACTICE SPARK SQL COMMANDS**

In [0]:

# Load the `employee` table from the SQLite file through the JDBC data source,
# passing the connection settings as a single options mapping.
sqlite_jdbc_options = {
    "url": 'jdbc:sqlite:/dbfs/FileStore/tables/my_sqlite.db',
    "dbtable": 'employee',
    "driver": 'org.sqlite.JDBC',
}
df = sqlContext.read.format('jdbc').options(**sqlite_jdbc_options).load()

In [0]:
# Read the configured table over JDBC using the url/tablename/driver values
# defined earlier in the notebook.
dbDataFrame = (
    spark.read.format("jdbc")
    .options(url=url, dbtable=tablename, driver=driver)
    .load()
)



In [0]:
# Example PostgreSQL read over JDBC. The server, table and credential values
# are placeholders; real credentials belong in a secret store, not the notebook.
pg_reader = (
    spark.read.format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", "jdbc:postgresql://database_server")
    .option("dbtable", "schema.tablename")
    .option("user", "username")
    .option("password", "my-secret-password")
)
pgDF = pg_reader.load()


In [0]:
# Restrict to two destination countries and print the query plan, which shows
# how the filter is handled for the JDBC source.
two_destinations = dbDataFrame.filter("DEST_COUNTRY_NAME in ('Anguilla', 'Sweden')")
two_destinations.explain()


In [0]:
# Pass a parenthesised, aliased subquery as the "table" so the DISTINCT is
# part of the SQL sent over JDBC rather than applied to the loaded table.
pushdownQuery = """(SELECT DISTINCT(DEST_COUNTRY_NAME) FROM flight_info)
  AS flight_info"""
dbDataFrame = (
    spark.read.format("jdbc")
    .options(url=url, dbtable=pushdownQuery, driver=driver)
    .load()
)


In [0]:
# Re-declare the SQLite location, then read the table requesting up to 10
# partitions for the JDBC read.
# Two fixes relative to the previous version of this cell:
#   * the path keeps the "/dbfs" prefix used by the other JDBC cells, because
#     the SQLite driver opens the file through the local filesystem;
#   * the URL prefix is "jdbc:sqlite:" (single colon) — the doubled colon made
#     the driver look for a file whose name starts with ":".
path = "/dbfs/FileStore/tables/my_sqlite.db"
url = "jdbc:sqlite:" + path

dbDataFrame = (
    spark.read.format("jdbc")
    .option("url", url)
    .option("dbtable", tablename)
    .option("driver", driver)
    .option("numPartitions", 10)
    .load()
)


In [0]:
# Partition the JDBC read by explicit predicates: each predicate string
# becomes one partition of the resulting DataFrame.
props = {"driver": "org.sqlite.JDBC"}
predicates = [
    "DEST_COUNTRY_NAME = 'Sweden' OR ORIGIN_COUNTRY_NAME = 'Sweden'",
    "DEST_COUNTRY_NAME = 'Anguilla' OR ORIGIN_COUNTRY_NAME = 'Anguilla'",
]
predicate_read_args = dict(predicates=predicates, properties=props)

# Show the matching rows, then report the partition count (expected: 2).
spark.read.jdbc(url, tablename, **predicate_read_args).show()
spark.read.jdbc(url, tablename, **predicate_read_args).rdd.getNumPartitions()  # 2


In [0]:
# Same predicate-partitioned read, this time with overlapping "!=" predicates;
# a row that satisfies both predicates is read once per predicate, so the
# count can exceed the number of rows in the table.
props = {"driver": "org.sqlite.JDBC"}
predicates = [
    "DEST_COUNTRY_NAME != 'Sweden' OR ORIGIN_COUNTRY_NAME != 'Sweden'",
    "DEST_COUNTRY_NAME != 'Anguilla' OR ORIGIN_COUNTRY_NAME != 'Anguilla'",
]
overlapping_read = spark.read.jdbc(
    url, tablename, predicates=predicates, properties=props
)
overlapping_read.count()


In [0]:
# Settings for a range-partitioned JDBC read on the numeric `count` column.
# The bounds were written as Python 2 long literals (0L, 348113L), which are a
# SyntaxError in Python 3; plain ints are equivalent and valid in both.
colName = "count"
lowerBound = 0
upperBound = 348113  # this is the max count in our database
numPartitions = 10


In [0]:
# Range-partitioned read: split on `colName` between the two bounds into
# `numPartitions` partitions, then count the rows (expected: 255).
range_partitioned = spark.read.jdbc(
    url,
    tablename,
    column=colName,
    properties=props,
    lowerBound=lowerBound,
    upperBound=upperBound,
    numPartitions=numPartitions,
)
range_partitioned.count()  # 255


In [0]:
# Write the CSV-sourced DataFrame into a new SQLite database, replacing the
# table if it already exists.
newPath = "jdbc:sqlite://tmp/my-sqlite.db"
csvFile.write.jdbc(
    newPath,
    tablename,
    mode="overwrite",
    properties=props,
)


In [0]:
# Read the freshly written table back and count its rows (expected: 255).
written_table = spark.read.jdbc(newPath, tablename, properties=props)
written_table.count()  # 255


In [0]:
# Append the same rows to the existing table instead of replacing it.
csvFile.write.jdbc(
    newPath,
    tablename,
    mode="append",
    properties=props,
)


In [0]:
# Count the rows again after the append (expected: 765).
appended_table = spark.read.jdbc(newPath, tablename, properties=props)
appended_table.count()  # 765


In [0]:

# Write the first 10 rows as text files partitioned by `count`: the partition
# column goes into the directory names, leaving DEST_COUNTRY_NAME as the
# single column written into each file.
# mode("overwrite") is added so the cell can be re-run; without an explicit
# mode the write fails once the output path exists, unlike the other file
# writes in this notebook, which all overwrite.
(
    csvFile.limit(10)
    .select("DEST_COUNTRY_NAME", "count")
    .write.mode("overwrite")
    .partitionBy("count")
    .text("/tmp/five-csv-files2py.csv")
)


In [0]:
# Write the first 10 rows partitioned by destination country, using the
# writer's default format and replacing any earlier output at this path.
ten_rows = csvFile.limit(10)
(
    ten_rows.write.mode("overwrite")
    .partitionBy("DEST_COUNTRY_NAME")
    .save("/tmp/partitioned-files.parquet")
)
