## Spark SQL

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

#### RDD vs. DataFrame

##### RDD

In [0]:
# Average age per name using the low-level RDD API.
name_age_pairs = [("Brooke", 20), ("Denny", 31), ("Jules", 30),
                  ("TD", 35), ("Brooke", 25)]
dataRDD = sc.parallelize(name_age_pairs)

# (name, age) -> (name, (age, 1)) so that ages and occurrences can be summed together.
age_with_count = dataRDD.map(lambda rec: (rec[0], (rec[1], 1)))
# Per name: add up the ages and the occurrence counts.
age_totals = age_with_count.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
# (name, (age_sum, n)) -> (name, age_sum / n)
agesRDD = age_totals.map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
agesRDD.collect()

##### DataFrame

In [0]:
# Average age per name, expressed with the DataFrame API.
people_rows = [("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)]
people_columns = ["name", "age"]
data_df = spark.createDataFrame(people_rows, people_columns)

ages_by_name = data_df.groupBy("name")
avg_df = ages_by_name.agg(avg("age"))
avg_df.show()

### Creating DataFrame

In [0]:
# Sample blog rows, one per line:
# [Id, First, Last, Url, Published, Hits, Campaigns]
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]



#### Defining a schema

##### Using DDL

In [0]:
# DDL-style schema string, assembled from one entry per column.
schema = ", ".join([
    "`Id` INT",
    "`First` STRING",
    "`Last` STRING",
    "`Url` STRING",
    "`Published` STRING",
    "`Hits` INT",
    "`Campaigns` ARRAY<STRING>",
])

In [0]:
# Build the blogs DataFrame from the in-memory rows, using the DDL string as its schema,
# then print its contents and its schema.
blogs_df = spark.createDataFrame(data, schema)
blogs_df.show()
blogs_df.printSchema()

##### Schema Created Programmatically

In [0]:

# Explicit schema for the blog rows; each spec is (column name, Spark type, nullable).
schema2 = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("Id", IntegerType(), False),
        ("First", StringType(), False),
        ("Last", StringType(), False),
        ("Url", StringType(), True),
        ("Published", StringType(), False),
        ("Hits", IntegerType(), False),
        # Array of strings whose elements may be null; the column itself may not.
        ("Campaigns", ArrayType(StringType(), True), False),
    ]
])

In [0]:
# Rebuild the blogs DataFrame from the same rows, this time with the StructType schema.
# Note: this rebinds the name blogs_df.
blogs_df = spark.createDataFrame(data, schema2)
blogs_df.show()
blogs_df.printSchema()

In [0]:
# Display the schema object attached to the DataFrame (shown as the cell's result).
blogs_df.schema

#### Creating DataFrames from Data Sources

* csv
* json
* text
* jdbc
* orc
* parquet

##### Create a DataFrame from JSON file

In [0]:
# Load the sample people dataset from JSON, then print its rows and schema.
people_json_path = "dbfs:/databricks-datasets/samples/people/people.json"
people_df = spark.read.json(people_json_path)
people_df.show()
people_df.printSchema()

##### Create a DataFrame from the NYT COVID data CSV file

In [0]:
# First attempt: read the CSV without passing any reader options (no header, no schema).
# Compare the columns and types printed here with the later reads that pass header=True.
covid_df = spark.read.csv("/databricks-datasets/COVID/covid-19-data/us-counties.csv")
covid_df.show()
covid_df.printSchema()

[Spark Documentation](https://spark.apache.org/docs/latest/)

In [0]:
# Read the CSV treating the first line as a header and letting Spark infer column types.
covid_df_ = spark.read.csv(
    "/databricks-datasets/COVID/covid-19-data/us-counties.csv",
    header=True,
    inferSchema=True,
)
covid_df_.show()

In [0]:
# Print the column names and types of covid_df_ (read with inferSchema=True).
covid_df_.printSchema()

In [0]:
# Explicit schema for the COVID CSV; every column is declared nullable.
covid_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Date", DateType()),
        ("county", StringType()),
        ("state", StringType()),
        ("fips", IntegerType()),
        ("cases", IntegerType()),
        ("deaths", IntegerType()),
    ]
])

In [0]:
# Read the CSV with the hand-written schema instead of inferring one; the first line is a header.
covid_df = spark.read.csv(
    "/databricks-datasets/COVID/covid-19-data/us-counties.csv",
    schema=covid_schema,
    header=True,
)
covid_df.show()

In [0]:
# Print the column names and types of covid_df.
covid_df.printSchema()

In [0]:
# Load the patients CSV with a header row and inferred column types.
patients_df = spark.read.csv(
    "dbfs:/databricks-datasets/rwe/ehr/csv/patients.csv",
    header=True,
    inferSchema=True,
)
patients_df.show()

### DataFrames Operations

#### Transformations

* select
* filter 
* where
* distinct
* dropDuplicates
* sort
* orderBy
* limit
* union
* withColumn
* withColumnRenamed
* drop
* sample
* join
* groupBy

#### Action

* show([n], [truncate])
* head([n])
* first()
* take(n)
* takeAsList(n) (Scala/Java API only)
* collect()
* count()

#### Aggregation Function

* count(col)
* countDistinct(col)
* min(col)
* max(col)
* sum(col)
* sumDistinct(col)
* avg(col)

### Working with DataFrames

##### Show our data

###### Covid DataFrame

In [0]:
# head() without an argument returns only the first row.
covid_df.head()

In [0]:
# Print the leading rows of covid_df as a text table.
covid_df.show()

###### Patients DataFrame

In [0]:
# take(1) returns a list holding the first row.
patients_df.take(1)

In [0]:
# Print the leading rows of patients_df as a text table.
patients_df.show()

##### Counting number of rows

In [0]:
# Number of rows in covid_df.
covid_df.count()

In [0]:
# Number of rows in patients_df.
patients_df.count()

##### Simple Select

In [0]:
# Three ways to reference the same two columns in a select:
# 1) col() column expressions
covid_df.select(col("county"), col("cases")).show(5)
# 2) indexing the DataFrame by column name
covid_df.select(covid_df["county"], covid_df["cases"]).show(5)
# 3) plain column-name strings
covid_df.select("county", "cases").show(5)

In [0]:
# Select computed columns: the year extracted from Date (aliased "Year") and cases + 1.
year_column = year("Date").alias("Year")
cases_plus_one = col("cases") + 1
covid_df.select(year_column, cases_plus_one).show(5)

##### Get the latest information

In [0]:
# Show the selected columns with the most recent dates first.
# .orderBy(...) can be used in place of .sort(...) here.
newest_date_first = covid_df["Date"].desc()
covid_df.select("Date", "county", "cases", "deaths").sort(newest_date_first).show()

##### Get the latest information from Los Angeles

In [0]:
# Sort newest first, then keep only the rows whose county is "Los Angeles".
# .where(...) can be used in place of .filter(...) here.
is_los_angeles = covid_df["county"] == "Los Angeles"
newest_first_df = covid_df.sort(covid_df["Date"].desc())
newest_first_df.filter(is_los_angeles).show()

##### Get information about counties in the US

In [0]:
# Count the distinct counties in the dataset.
# A county name alone is not a unique key (the same name can occur in more than
# one state), so a county is identified by its (county, state) pair; counting
# distinct names only would merge such counties and under-count.
covid_df.select("county", "state").distinct().count()


##### Compute number of healed people (cases minus deaths)

In [0]:
# Add a "heald" column computed as cases minus deaths, then list the newest rows first.
cases_minus_deaths = col("cases") - col("deaths")
covid_df_with_heald = covid_df.withColumn("heald", cases_minus_deaths)

newest_date_first = covid_df_with_heald["Date"].desc()
covid_df_with_heald.sort(newest_date_first).show()

##### Get counties with the most cases

In [0]:
# Top 5 counties by their highest reported case count.
# Group on (county, state) rather than county alone: county names can repeat
# across states, and grouping on the name only would merge those counties
# into a single row.
# Note: `max` here is pyspark.sql.functions.max (the star import shadows the builtin).
(
    covid_df
    .select("county", "state", "cases")
    .groupBy("county", "state")
    .agg(max("cases").alias("max_cases"))
    .sort(col("max_cases").desc())
    .limit(5)
    .show()
)

In [0]:
# Look at the patients data and its column types before reshaping it.
patients_df.show()
patients_df.printSchema()

##### Make patients DF easier to use

In [0]:
# Drop the SSN, DRIVERS and PASSPORT columns to get a narrower DataFrame.
columns_to_drop = ["SSN", "DRIVERS", "PASSPORT"]
new_patients_df = patients_df.drop(*columns_to_drop)
new_patients_df.show()

In [0]:
# Print the remaining columns and their types.
new_patients_df.printSchema()

###### Patients with the longest last names

In [0]:
# Distinct last names paired with their character length; show the 10 longest.
distinct_last_names = new_patients_df.select("LAST").distinct()
patients_name_DF = distinct_last_names.selectExpr("*", "length(LAST) as length")

longest_first = col("length").desc()
patients_name_DF.sort(longest_first).limit(10).show()


##### Number of patients from each City

In [0]:
# Count patient Ids per city.
patients_per_city = (
    new_patients_df
    .select("Id", "City")
    .groupBy("City")
    .agg(count("Id").alias("Number of patients"))
)
patients_per_city.show()


## Using SQL in Spark SQL

In [0]:
# spark.sql runs a SQL statement and hands the result back as a DataFrame.
info_query = "select current_date() as today , 1 + 100 as value"
infoDF = spark.sql(info_query)
infoDF.show()


In [0]:
# Register covid_df as a temporary view named "covid" so SQL queries can refer to it.
covid_df.createOrReplaceTempView("covid")

In [0]:
# Query the "covid" view with SQL and print the leading rows.
spark.sql("SELECT * FROM covid").show()

In [0]:
# Ten counties with the highest reported case counts, queried through the "covid" view.
# The query is a triple-quoted string: backslash continuations inside a normal
# string literal join the lines with no separator, so they only parse while
# every continued line happens to start with whitespace.
# Grouping uses (county, state) because a county name alone can repeat across
# states and would otherwise merge different counties into one row.
spark.sql("""
    SELECT max(cases) AS max_cases, max(deaths) AS max_deaths, county, state
    FROM covid
    GROUP BY county, state
    ORDER BY max_cases DESC
    LIMIT 10
""").show()


