# Download Datasets

In [0]:
%sh
# Download the four sample CSVs into the current working directory.
#   -f  : fail on HTTP errors instead of saving the error page as a .csv
#   -sS : no progress meter, but still print errors
#   -L  : follow redirects
# set -e stops the cell at the first failed download.
set -e
base_url='https://raw.githubusercontent.com/masfworld/datahack_docker/master/zeppelin/data'
for name in bank vehicles characters netflix_titles; do
  curl -fsSL -O "${base_url}/${name}.csv"
done

In [0]:
# Create the DBFS target folder, then copy each downloaded CSV from the
# driver's local directory into it.
dbutils.fs.mkdirs("/dataset")
for csv_name in ("bank.csv", "vehicles.csv", "characters.csv", "netflix_titles.csv"):
    dbutils.fs.cp(f"file:/databricks/driver/{csv_name}", f"dbfs:/dataset/{csv_name}")

# Reading Data with Spark SQL

---



## Example 1

In [0]:
# Preview the beginning of the raw bank CSV that was copied to /dataset.
dbutils.fs.head('/dataset/bank.csv')

Converting an RDD to a DataFrame

In [0]:
from pyspark.sql.types import Row
from pyspark.sql.functions import *

# Read the copy stored in DBFS rather than the driver's local disk: a
# file:/databricks/driver/ path exists only on the driver, so executors on a
# multi-node cluster cannot open it.
bankText = spark.sparkContext.textFile("dbfs:/dataset/bank.csv")

# Split each ';'-separated line, drop the header row, keep five of the fields
# (index 4 is skipped) with their double quotes removed, then name the columns.
bank = (
    bankText
    .map(lambda lineaCsv: lineaCsv.split(";"))
    .filter(lambda s: s[0] != "\"age\"")
    .map(lambda row: Row(
        int(row[0]),
        row[1].replace("\"", ""),
        row[2].replace("\"", ""),
        row[3].replace("\"", ""),
        row[5].replace("\"", ""),
    ))
    .toDF(["age", "job", "marital", "education", "balance"])
    .withColumn("age", col("age").cast("int"))
    # balance is still text at this point; cast it so the schema is numeric.
    .withColumn("balance", col("balance").cast("int"))
)

In [0]:
# Print the column names and types of the bank DataFrame.
bank.printSchema()

In [0]:
# Register the DataFrame as the temporary view "bank" so it can be queried by name from SQL.
bank.createOrReplaceTempView("bank")

In [0]:

from pyspark.sql.functions import *

# Average balance per marital status, highest average first.
# The chain is parenthesised rather than joined with backslashes, so no
# trailing "\" can accidentally splice the next line into the expression.
bank_grouped = (
    bank
    .groupBy(bank.marital)
    .agg({"balance": "avg"})
    .select("marital", col("avg(balance)").alias("balance_avg"))
    .orderBy(col("balance_avg").desc())
)

bank_grouped.display()


In [0]:
# Convert the aggregated result to a pandas DataFrame.
bank_grouped.toPandas()

In [0]:
%sql
-- Average balance per marital status, highest first: the same ordering the
-- DataFrame query on the "bank" data uses.
SELECT marital, avg(balance) AS balance_avg
FROM bank
GROUP BY marital
ORDER BY balance_avg DESC

## Example 2

Loading a CSV file as an RDD and converting it into a DataFrame, applying an explicit schema with the `createDataFrame` method

In [0]:
from pyspark.sql.types import *

# Explicit schema for the five fields kept from bank.csv; every field is
# declared non-nullable.
bankSchema = StructType([
    StructField("age", IntegerType(), False),
    StructField("job", StringType(), False),
    StructField("marital", StringType(), False),
    StructField("education", StringType(), False),
    StructField("balance", IntegerType(), False),
])

# Read the copy stored in DBFS rather than the driver's local disk: a
# file:/databricks/driver/ path exists only on the driver, so executors on a
# multi-node cluster cannot open it.
bankText = spark.sparkContext.textFile("dbfs:/dataset/bank.csv")

# Parse each ';'-separated line into a tuple matching bankSchema, skipping the
# header row. The RDD gets its own name so the DataFrame called `bank` from
# Example 1 is not replaced by an object of a different type.
bankRows = (
    bankText
    .map(lambda s: s.split(";"))
    .filter(lambda s: s[0] != "\"age\"")
    .map(lambda s: (
        int(s[0]),
        s[1].replace("\"", ""),
        s[2].replace("\"", ""),
        s[3].replace("\"", ""),
        int(s[5]),
    ))
)

bankdf = spark.createDataFrame(bankRows, bankSchema)
bankdf.createOrReplaceTempView("bank2")

In [0]:
%sql
-- Show up to 10 rows of the bank2 temporary view.
SELECT * FROM bank2 LIMIT 10

## Exercise 1
Load file `vehicles.csv` to a DataFrame, showing the content and printing the schema.

Use this [documentation](https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html) to read data in a DataFrame

---



In [0]:
# Preview the beginning of the raw vehicles CSV that was copied to /dataset.
dbutils.fs.head('/dataset/vehicles.csv')

In [0]:
# Load vehicles.csv as a DataFrame: comma-separated, header row supplies the
# column names, column types are inferred.
vehiclesDF_all = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("/dataset/vehicles.csv")
)


In [0]:
# Show the content of the vehicles DataFrame.
vehiclesDF_all.display()

In [0]:
# Print the column names and types of the vehicles DataFrame.
vehiclesDF_all.printSchema()

Filter the previous DataFrame to get the vehicles whose cargo capacity is greater than 70

---



In [0]:
# Cast cargo_capacity to float, then show only the rows where it exceeds 70.
display(
    vehiclesDF_all
    .withColumn("cargo_capacity", col("cargo_capacity").cast("float"))
    .where(col("cargo_capacity") > 70)
)

# Spark SQL. Aggregation Functions

Useful Links:

https://spark.apache.org/docs/latest/api/python/pyspark.sql.html


## Exercise 2

Using the DataFrame with all vehicles loaded in Exercise 1, get the average of passengers by vehicle class


---




In [0]:
from pyspark.sql.types import IntegerType

# Cast passengers to an integer column, then show its average per vehicle_class.
display(
    vehiclesDF_all
    .withColumn("passengers", col("passengers").cast(IntegerType()))
    .groupBy("vehicle_class")
    .agg({"passengers": "avg"})
)

## Exercise 3

Load the file `characters.csv` and get the most common eye color among all characters

---

In [0]:
# Load characters.csv as a DataFrame: comma-separated, header row supplies the
# column names, column types are inferred.
# The chain is parenthesised rather than joined with backslashes, so no
# trailing "\" can accidentally splice the next line into the expression.
characters_df = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/dataset/characters.csv")
)

In [0]:
# Count characters per eye_color, sort by that count descending, and show the
# single top row.
display(
    characters_df
    .groupBy("eye_color")
    .count()
    .sort(col("count").desc())
    .limit(1)
)

## Exercise 4

1. Load characters DataFrame into a temporary table
2. Using SQL, get the number of characters by gender


---



In [0]:
# Replace the gender value "none" with "NA" (other values are kept as they
# are) and register the result as the temporary view "characters".
gender_normalized = when(col("gender") == "none", lit("NA")).otherwise(col("gender"))
characters_df.withColumn("gender", gender_normalized).createOrReplaceTempView("characters")

In [0]:
%sql
-- Number of rows in the "characters" view for each gender value.
SELECT
  gender,
  COUNT(*) AS count
FROM characters
GROUP BY gender

## Exercise 5

Load `netflix_titles.csv` file in a DataFrame, printing the schema

---



In [0]:
# Preview the beginning of the raw Netflix titles CSV that was copied to /dataset.
dbutils.fs.head('/dataset/netflix_titles.csv')

In [0]:
# Load netflix_titles.csv as a DataFrame: comma-separated, header row supplies
# the column names, column types are inferred.
netflixDF_all = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("/dataset/netflix_titles.csv")
)


In [0]:
# Print the column names and types of the Netflix DataFrame.
netflixDF_all.printSchema()

In [0]:
# Show the content of the Netflix DataFrame.
netflixDF_all.display()

## Exercise 6

Get the year in which the most films were added (movies only, no TV shows). Use a UDF to extract the year

---



In [0]:
from pyspark.sql.functions import *

# Keep only the rows whose type is "Movie"; every other type is dropped.
# Written as a single expression with no backslash continuation, so no
# trailing "\" can accidentally splice the next line into it.
netflixDF_movies = netflixDF_all.filter(netflixDF_all.type == "Movie")

netflixDF_movies.display()

In [0]:
from pyspark.sql.functions import *

def getYearAdded(s):
    """Return the last four characters of a date string, taken to be the year.

    Assumes the value ends with a four-digit year (for example
    "September 9, 2019") -- confirm against the data. Surrounding whitespace
    is ignored so a padded value still yields the year, and None is passed
    through unchanged instead of raising.
    """
    if s is None:
        return None
    return s.strip()[-4:]
# Wrap the Python helper so it can be applied to a DataFrame column.
getYearAdded_udf = udf(getYearAdded)

# Count movies per year added and keep the year with the highest count.
# limit(1) keeps the result a DataFrame, which is what display() renders as a
# table; first() would pass it a single Row object instead.
# The column is named year_added because it is derived from date_added.
display(
    netflixDF_movies
    .filter(col("date_added").isNotNull())
    .select(getYearAdded_udf("date_added").alias("year_added"))
    .groupBy(col("year_added"))
    .count()
    .orderBy(col("count").desc())
    .limit(1)
)

## Exercise 7

---
Using Dataframe API, split `characters.csv` in 3 different csv

In [0]:
# Remove the output directory left by a previous run (second argument True: presumably "recurse" -- confirm).
dbutils.fs.rm("dbfs:/tmp/characters_csv/", True)

In [0]:
# Repartition into 3 partitions so the write produces 3 CSV part files, each
# with a header row. mode("overwrite") keeps this cell re-runnable even when
# the output directory already exists.
(
    characters_df
    .repartition(3)
    .write
    .mode("overwrite")
    .option('header', True)
    .csv("/tmp/characters_csv/")
)

In [0]:
# List the files in the characters CSV output directory.
dbutils.fs.ls("dbfs:/tmp/characters_csv/")

In [0]:
# Part-file names contain an id that changes on every write, so look the file
# up instead of hard-coding a name from one particular run.
characters_part_paths = sorted(
    info.path
    for info in dbutils.fs.ls("dbfs:/tmp/characters_csv/")
    if info.name.startswith("part-") and info.name.endswith(".csv")
)
if not characters_part_paths:
    raise FileNotFoundError("no part-*.csv files found in dbfs:/tmp/characters_csv/")

# Preview the beginning of the first part file.
dbutils.fs.head(characters_part_paths[0])