### Import libraries and ingest data

In [1]:
from pyspark.sql import SparkSession
# The helper module is `pyspark.sql.functions` (plural); importing
# `function` raises ImportError and leaves `func` undefined for later cells.
from pyspark.sql import functions as func

# Obtain a session named "SparkSQLDataFrame" (getOrCreate reuses an existing
# one if there is one).
spark = SparkSession.builder.appName("SparkSQLDataFrame").getOrCreate()

# Load the CSV, treating its first line as column names and letting Spark
# infer each column's type from the data.
people = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("./source/fakefriends_header.csv")
)

### Some popular features

In [None]:
# Quick tour of common DataFrame operations on `people`.

print("Here is our inferred schema:")
people.printSchema()

print("Let's display the name column:")
people.select("name").show()

# The predicate keeps rows with age strictly below 21, so 21-year-olds are
# dropped as well. The label states exactly that, rather than the previous
# "Filter out anyone over 21", which did not match the code.
print("Keep only people younger than 21:")
people.filter(people.age < 21).show()

print("Group by age")
people.groupBy("age").count().show()

print("Make everyone 10 years older:")
people.select(people.name, people.age + 10).show()

### Average number of friends by age

In [None]:
# Keep just the two columns this analysis needs: "age" and "friends".
friendsByAge = people.select("age", "friends")

# Shared building blocks for the reports below: the per-age grouping and
# the average of "friends" rounded to two decimals.
ageGroups = friendsByAge.groupBy("age")
roundedAvg = func.round(func.avg("friends"), 2)

# Average of "friends" for each age, with no explicit ordering.
ageGroups.avg("friends").show()

# Same averages, ordered by age.
ageGroups.avg("friends").sort("age").show()

# Rounded to two decimals for readability.
ageGroups.agg(roundedAvg).sort("age").show()

# Rounded, and with the result column renamed to "friends_avg".
ageGroups.agg(roundedAvg.alias("friends_avg")).sort("age").show()