# DataFrame - SQL Queries

## Prepare the Spark session

In [None]:
# findspark configures the environment so the pyspark imports below resolve
import findspark

findspark.init()

# Classes needed to configure and build the session
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Describe the application (name + local master using all available cores),
# then create the session or reuse one that is already running
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Read sample datasets

In [None]:
# Load the "live" dataset: tab-separated, no header row, inferred column types.
# Only the auto-named columns _c6 and _c1 are kept, renamed to location/amount.
live_df = (
    spark.read
    .options(sep='\t', header=False, inferSchema=True)
    .csv('./data/live.tsv')
    .select('_c6', '_c1')
    .toDF('location', 'amount')
)
live_df.show(5)

In [None]:
# Load the "most-backed" dataset with the same layout as the live one:
# tab-separated, no header row, inferred types, columns _c6/_c1 kept and
# renamed to location/amount.
most_df = (
    spark.read
    .options(sep='\t', header=False, inferSchema=True)
    .csv('./data/most-backed.tsv')
    .select('_c6', '_c1')
    .toDF('location', 'amount')
)
most_df.show(5)

## Register tables

In [None]:
# Register live_df as the temporary view 'live' so SQL queries can refer to it.
# createTempView does not overwrite: it raises an error if a view named 'live'
# is already registered (e.g. when this cell is re-run in the same session).
live_df.createTempView('live')

In [None]:
# Register most_df as the temporary view 'most', replacing any existing view
# with that name, so this cell can be re-run without errors.
most_df.createOrReplaceTempView('most')

## Execute queries

In [None]:
# Run a SQL query against the registered 'live' view. spark.sql returns a
# DataFrame, so the first 5 rows can be displayed with show().
spark.sql('SELECT * FROM live').show(5)

In [None]:
# Queries as complex as in SQL.
# Per location present in both views: the total of the live amounts above 1000
# and the average most-backed amount.
# Each side is aggregated BEFORE the join. Joining the raw rows first pairs
# every live row with every most-backed row of the same location, which
# multiplies SUM(l.amount) by the number of matching most-backed rows.
spark.sql('''
    SELECT l.location, l.live_amount, m.avg_most_amount
    FROM (
        SELECT location, SUM(amount) AS live_amount
        FROM live
        WHERE amount > 1000
        GROUP BY location
    ) l
    JOIN (
        SELECT location, AVG(amount) AS avg_most_amount
        FROM most
        GROUP BY location
    ) m
      ON l.location = m.location
    ORDER BY l.location
''').show(5)

## Close the session

In [None]:
# Stop the session now that all queries have run
spark.stop()