### Spark fundamentals

In [None]:
# Import pyspark(Spark's python wrapper)
from pyspark.sql import SparkSession

In [None]:
# Get the active SparkSession, or start a new one if none exists yet
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Location of the CSV file to load
file_path = "./data/people.csv"

# Read the CSV into a DataFrame; header=True takes the column names
# from the first line of the file
csv_reader = spark.read
df = csv_reader.csv(path=file_path, header=True)

# Print the first rows to check that the load worked
df.show()

In [None]:
# Name under which the DataFrame will be visible to Spark SQL
# NOTE(review): the view is called "organizations" although the data was
# loaded from people.csv -- confirm the name is intended
table_name = "organizations"

# Register the DataFrame as a temporary view (replacing any existing view
# with the same name) so that it can be queried with SQL
df.createOrReplaceTempView(name=table_name)

In [None]:
# Build a query that returns every row and column of the view
query = f"SELECT * FROM {table_name}"

# Run it through Spark SQL and print the resulting DataFrame
all_rows = spark.sql(query)
all_rows.show()

In [None]:
# Inspect table schema, option 1: ask Spark SQL for the column list
columns_stmt = f"SHOW COLUMNS FROM {table_name}"
result = spark.sql(columns_stmt)

result.show()

In [None]:
# Inspect table schema, option 2: select zero rows and read the column
# names off the resulting DataFrame
empty_select = f"SELECT * FROM {table_name} LIMIT 0"
result = spark.sql(empty_select)

print(result.columns)

In [None]:
# Inspect table schema, option 3: DESCRIBE the view
describe_stmt = f"DESCRIBE {table_name}"
result = spark.sql(describe_stmt)

result.show()

### Window Function SQL
* OVER and ORDER BY clauses

In [None]:
# Train schedule dataset: read the CSV (column names come from the first
# line) and expose it to Spark SQL as the temporary view "sched"
df = spark.read.option("header", True).csv("./data/train_schedule.csv")
df.createOrReplaceTempView("sched")

In [None]:
# Show the whole schedule sorted by the time column
# NOTE(review): the CSV is read without a schema, so time is presumably
# compared as a string -- confirm the ordering is what is expected
ordered_sched = spark.sql("SELECT * FROM sched ORDER BY time")
ordered_sched.show()

In [None]:
# For train 101 only: next to each stop's time, show the time of the
# following row (in time order) as time_next
query = """
SELECT train_id, station, time,
LEAD(time, 1) OVER (ORDER BY time) AS time_next
FROM sched
WHERE train_id=101
"""

next_stop_df = spark.sql(query)
next_stop_df.show()

In [None]:
# PARTITION BY train_id makes LEAD look only at rows of the same train,
# so time_next is computed separately per train and the WHERE filter on a
# single train_id is no longer needed
query = """
SELECT train_id, station, time,
LEAD(time, 1) OVER (PARTITION BY train_id ORDER BY time) AS time_next
FROM sched
"""

per_train_next = spark.sql(query)
per_train_next.show()

In [None]:
# The following are example queries.
# Both read from "sched", the temp view registered for the train schedule
# earlier in this notebook (no view named "schedule" is created here).

# Original query: LEAD has no PARTITION BY, so time_next is simply the time
# of the next row in the whole table, regardless of which train it belongs to.
# It is kept for comparison only and is overwritten below without being run.
query = """
SELECT
ROW_NUMBER() OVER (ORDER BY time) AS row,
train_id,
station,
time,
LEAD(time,1) OVER (ORDER BY time) AS time_next
FROM sched
"""

# Updated query: the original lacked the PARTITION BY clause, which produced
# the bad row recorded in bad_row below. With PARTITION BY train_id, LEAD
# only looks at later stops of the same train.
query = """
SELECT
ROW_NUMBER() OVER (ORDER BY time) AS row,
train_id,
station,
time,
LEAD(time,1) OVER (PARTITION BY train_id ORDER BY time) AS time_next
FROM sched
"""
spark.sql(query).show()

# Give the number of the bad row as an integer
bad_row = 7

# Provide the missing clause, SQL keywords in upper case
clause = 'PARTITION BY train_id'

In [None]:
# Ways to select 2 columns
# Start by listing the column names available on the DataFrame
df.columns

In [None]:
# Preview the first 5 rows of the DataFrame
df.show(n=5)

In [None]:
# Show only 5 records and 2 columns, picking the columns by name
wanted_columns = ['train_id', 'station']
df.select(wanted_columns).show(5)

In [None]:
# The same selection using <dot> notation: columns are read as attributes
# of the DataFrame
train_column = df.train_id
station_column = df.station
df.select(train_column, station_column).show(5)

In [None]:
# The <col> function can also be imported
# This enables passing in column names as strings
from pyspark.sql.functions import col

In [None]:
# The same selection using <col>: each column is built from its name
df.select([col(name) for name in ('train_id', 'station')]).show(5)

In [None]:
# Select the two columns, then rename train_id to train with
# <withColumnRenamed>
two_columns = df.select('train_id', 'station')
renamed = two_columns.withColumnRenamed('train_id', 'train')
renamed.show(5)

* Note: avoid mixing all three column-reference conventions (string names, dot notation, and `col()`) in the same piece of code

In [None]:
# View and columns used to build the query
table_name = 'sched'
target_col = 'train_id'
sup_col = 'station'

# The same rename in SQL: alias the target column as "train" and keep
# only 5 rows
alias_query = f'SELECT {target_col} AS train, {sup_col} FROM {table_name} LIMIT 5'
spark.sql(alias_query).show()

In [None]:
# The same result with the DataFrame API: alias() renames train_id to
# train, and limit(5) plays the role of LIMIT 5
train_alias = col('train_id').alias('train')
df.select(train_alias, 'station').limit(5).show()

In [None]:
# Window function in SQL

# ROW_NUMBER() numbers the stops of each train line in time order,
# restarting for every train_id; the number goes into a new column called id
query = """
SELECT *,
ROW_NUMBER() OVER(PARTITION BY train_id ORDER BY time) AS id
FROM sched
"""

numbered_stops = spark.sql(query)
numbered_stops.show(11)


In [None]:
# The same numbering of stops per train, written with the DataFrame API
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Window spec: one partition per train, rows ordered by time
per_train_by_time = Window.partitionBy('train_id').orderBy('time')
df.withColumn("id", row_number().over(per_train_by_time)).show(11)

In [None]:
# Consider the following examples
# The first pair gives the identical result in each command: the earliest
# time per train, in a column named start
spark.sql('SELECT train_id, MIN(time) AS start FROM sched GROUP BY train_id').show()
df.groupBy('train_id').agg({'time':'min'}).withColumnRenamed('min(time)', 'start').show()

# The second pair does NOT match. A Python dict literal keeps only the last
# value given for a repeated key, so agg() receives {'time': 'max'} and the
# min aggregation is silently dropped, while the SQL returns both MIN and MAX.
spark.sql('SELECT train_id, MIN(time), MAX(time) FROM sched GROUP BY train_id').show()
result = df.groupBy('train_id').agg({'time':'min', 'time':'max'})
result.show()
# Print the second column of the result (presumably 'max(time)' -- confirm)
print(result.columns[1])

In [None]:
# Aggregating the same column twice

# A dict passed to agg() can hold only one aggregation per column, so the
# DataFrame API needs explicit column expressions here, which is more
# verbose than the SQL version.
# This computes the first and last times for each train line.

# NOTE: this import shadows Python's built-in min and max from here on
from pyspark.sql.functions import min, max, col

time_column = col("time")
expr = [
    min(time_column).alias('start'),
    max(time_column).alias('end'),
]
dot_df = df.groupBy("train_id").agg(*expr)
dot_df.show(5)

In [None]:
# Aggregating the same column twice in SQL: first (start) and last (end)
# time for each train line

query = """
SELECT train_id, MIN(time) as start, MAX(time) as end FROM sched GROUP BY train_id
"""

start_end_df = spark.sql(query)
start_end_df.show(5)