Spark SQL and DataFrames are fully interoperable:
- You can create a DataFrame, manipulate it with SQL, and then manipulate the result again as a DataFrame.

In [0]:
# Expose the JSON flight summary to SQL under the temp view name "some_sql_view".
(
    spark.read
    .json("/FileStore/tables/2015_summary.json")
    .createOrReplaceTempView("some_sql_view")
)

# SQL => DF: aggregate with SQL, then keep refining the result with DataFrame methods.
# The backticks are required because the aggregate column is literally named "sum(count)".
test = (
    spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count)
FROM some_sql_view GROUP BY DEST_COUNTRY_NAME
""")
    .where("DEST_COUNTRY_NAME like 'S%'")
    .where("`sum(count)` > 10")
    .count()
)

print(test)




In [0]:
# Smallest possible SQL round-trip: evaluate a constant expression and print the result.
# (The original call was missing its closing parenthesis, which is a SyntaxError.)
spark.sql("SELECT 1 + 1").show()

In [0]:
# Build a small in-memory DataFrame of (department name, department id) rows.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
]
deptColumns = ["dept_name", "dept_id"]
deptDF = spark.createDataFrame(dept, schema=deptColumns)


In [0]:
# Print deptDF's schema (column names and their types).
deptDF.printSchema()

In [0]:
# Print the rows of deptDF as a text table; truncate=False keeps long cell values intact.
deptDF.show(truncate=False)

In [0]:
# DF => SQL: register deptDF as the temp view "dept" so it can be queried with SQL.
deptDF.createOrReplaceTempView("dept")

# SQL => DF: the query result is itself a DataFrame.
deptDF2 = spark.sql("select * from dept where dept_id=10")

# Render the result twice: as plain text, then with the notebook's display() helper.
deptDF2.show(truncate=False)
display(deptDF2)

dept_name,dept_id
Finance,10


In [0]:
# Define the table `flights` with an explicit schema over the JSON file at the given path.
# IF NOT EXISTS keeps this cell re-runnable: a table definition outlives the notebook
# session, so a plain CREATE TABLE fails with "table already exists" on the second run.
# This matches the guard already used for flights_from_select.
spark.sql(
    "CREATE TABLE IF NOT EXISTS flights ("
    "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG) "
    "USING JSON OPTIONS (path '/FileStore/tables/2015_summary.json')"
)

In [0]:
# Define the table `flights_csv` over a CSV file that has a header row.
# The COMMENT clause attaches a description to ORIGIN_COUNTRY_NAME.
# IF NOT EXISTS keeps this cell re-runnable (a plain CREATE TABLE fails once the table exists).
# NOTE(review): this path differs from the /FileStore/tables location used for the JSON
# file in this notebook; confirm the CSV actually exists here.
spark.sql(
    "CREATE TABLE IF NOT EXISTS flights_csv ("
    "DEST_COUNTRY_NAME STRING, "
    "ORIGIN_COUNTRY_NAME STRING COMMENT 'remember, the US will be most prevalent', "
    "count LONG) "
    "USING csv OPTIONS (header true, path '/data/flight-data/csv/2015-summary.csv')"
)
       


In [0]:
# CREATE TABLE ... AS SELECT: copy every row of `flights` into a new parquet-backed table.
# IF NOT EXISTS makes the statement a no-op when the table already exists, so the cell can be re-run.
spark.sql('CREATE TABLE IF NOT EXISTS flights_from_select USING parquet AS SELECT * FROM flights')



In [0]:
# Render the contents of flights_from_select with the notebook's display() helper.
display(spark.sql('SELECT * FROM flights_from_select'))

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1


In [0]:
# CREATE TABLE ... AS SELECT into a parquet table partitioned by destination country,
# seeded with 5 rows from `flights`.
# IF NOT EXISTS keeps this cell re-runnable (a plain CREATE TABLE fails once the table exists).
spark.sql(
    "CREATE TABLE IF NOT EXISTS partitioned_flights "
    "USING parquet PARTITIONED BY (DEST_COUNTRY_NAME) AS "
    "SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 5"
)

In [0]:
# Render the contents of partitioned_flights.
# Note: in the result, the partition column DEST_COUNTRY_NAME is listed last, after the data columns.
display(spark.sql('SELECT * FROM partitioned_flights'))

ORIGIN_COUNTRY_NAME,count,DEST_COUNTRY_NAME
United States,15,Egypt
Romania,15,United States
Croatia,1,United States
Ireland,344,United States
India,62,United States


In [0]:
# Append up to 20 rows from `flights` to flights_from_select.
# There is no ORDER BY, so which 20 rows are picked is not guaranteed.
# INSERT INTO appends: every re-run of this cell adds another batch of rows.
spark.sql('INSERT INTO flights_from_select SELECT DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count FROM flights LIMIT 20')

In [0]:
# Re-query the table after the INSERT in the previous cell.
display(spark.sql('SELECT * FROM flights_from_select'))
# The table should now hold up to 20 more rows than it did before the INSERT.

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1


In [0]:
# Append up to 12 United States-bound flights into the matching static partition.
# Two fixes relative to the original statement:
# - INSERT maps SELECT columns to table columns by position, and the non-partition
#   columns of partitioned_flights are (ORIGIN_COUNTRY_NAME, count) in that order,
#   so the SELECT list must follow the same order (it was reversed).
# - String comparison is case-sensitive and the data spells the country
#   'United States'; the literal 'UNITED STATES' matched no rows and named a
#   partition different from the existing one.
spark.sql(
    "INSERT INTO partitioned_flights "
    "PARTITION (DEST_COUNTRY_NAME = 'United States') "
    "SELECT ORIGIN_COUNTRY_NAME, count FROM flights "
    "WHERE DEST_COUNTRY_NAME = 'United States' LIMIT 12"
)

In [0]:
# List each column of flights_csv with its data type and any comment attached to it.
display(spark.sql('DESCRIBE TABLE flights_csv '))

col_name,data_type,comment
DEST_COUNTRY_NAME,string,
ORIGIN_COUNTRY_NAME,string,"remember, the US will be most prevalent"
count,bigint,


In [0]:
# List the partitions that currently exist for partitioned_flights
# (one entry per DEST_COUNTRY_NAME value that has been written).
display(spark.sql('SHOW PARTITIONS partitioned_flights'))

partition
DEST_COUNTRY_NAME=Egypt
DEST_COUNTRY_NAME=United States
