### Spark fundamentals

In [None]:
# Import pyspark(Spark's python wrapper)
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if one exists; otherwise start a new one
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Location of the people dataset (CSV)
file_path = "./data/people.csv"

# Read the CSV into a DataFrame, taking column names from the first row
csv_reader = spark.read.option("header", True)
df = csv_reader.csv(file_path)

In [None]:
# Name under which the DataFrame is exposed to SQL queries
# NOTE(review): the view is called "organizations" although the data was
# loaded from people.csv — confirm this is the intended name
table_name = "organizations"

# Register the DataFrame as a temporary view (replacing any existing view
# with the same name) so it can be queried through spark.sql
df.createOrReplaceTempView(table_name)

In [None]:
# SQL statement selecting every row and column of the registered view
query = f"SELECT * FROM {table_name}"

# Run the statement and print the leading rows of the result
all_rows = spark.sql(query)
all_rows.show()

In [None]:
# Schema inspection, option 1: list the view's column names with SHOW COLUMNS
show_columns_query = f"SHOW COLUMNS FROM {table_name}"
result = spark.sql(show_columns_query)

result.show()

In [None]:
# Schema inspection, option 2: run a query that returns no rows (LIMIT 0)
# and read the column names off the resulting DataFrame
empty_select_query = f"SELECT * FROM {table_name} LIMIT 0"
result = spark.sql(empty_select_query)

column_names = result.columns
print(column_names)

In [None]:
# Schema inspection, option 3: DESCRIBE the view and display the output
describe_query = f"DESCRIBE {table_name}"
result = spark.sql(describe_query)

result.show()

### Window Function SQL
* OVER and ORDER BY clauses

In [None]:
# Load the train schedule CSV (first row holds the column names)
# and expose it to SQL as the temporary view "sched"
schedule_reader = spark.read.option("header", True)
df = schedule_reader.csv("./data/train_schedule.csv")
df.createOrReplaceTempView("sched")

In [None]:
# Display the whole schedule sorted by the time column
ordered_schedule = spark.sql("SELECT * FROM sched ORDER BY time")
ordered_schedule.show()

In [None]:
# For train 101 only: alongside each stop, show the time of the following stop.
# LEAD(time, 1) looks one row ahead within the window ordered by time.
query = """
SELECT train_id, station, time,
LEAD(time, 1) OVER (ORDER BY time) AS time_next
FROM sched
WHERE train_id=101
"""

next_stop_times = spark.sql(query)
next_stop_times.show()

In [None]:
# Same look-ahead for every train at once: PARTITION BY train_id gives each
# train its own window, so LEAD only looks ahead among rows of the same train
# and no WHERE filter on train_id is needed. Partitioning the window also
# avoids evaluating it over all rows as one single group.
query = """
SELECT train_id, station, time,
LEAD(time, 1) OVER (PARTITION BY train_id ORDER BY time) AS time_next
FROM sched
"""

per_train_next_times = spark.sql(query)
per_train_next_times.show()

In [None]:
# Example query: number every stop chronologically and pair it with the
# time of the next stop. Both windows order by time with no PARTITION BY,
# so they span the rows of all trains together.
# The query reads from "sched", the temp view name the train schedule was
# registered under; the previous "schedule" does not match that view.
query = """
SELECT
ROW_NUMBER() OVER (ORDER BY time) AS row,
train_id,
station,
time,
LEAD(time,1) OVER (ORDER BY time) AS time_next
FROM sched
"""