In [0]:
# display() renders the result as a table in the notebook.
# dbutils = Databricks Utilities
# fs      = its file system module
# ls      = list the contents of a path
dataset_root_entries = dbutils.fs.ls('/databricks-datasets/')
display(dataset_root_entries)

# Rough shell equivalent (assuming DBFS is mounted at /dbfs on the driver -- verify):
#   ls /dbfs/databricks-datasets/

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,1721064981407
dbfs:/databricks-datasets/README.md,README.md,976,1532502324000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,1721064981407
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455505834000
dbfs:/databricks-datasets/adult/,adult/,0,1721064981407
dbfs:/databricks-datasets/airlines/,airlines/,0,1721064981407
dbfs:/databricks-datasets/amazon/,amazon/,0,1721064981407
dbfs:/databricks-datasets/asa/,asa/,0,1721064981407
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,1721064981407
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,1721064981407


In [0]:
# List the yellow-taxi trip data files
yellow_trip_files = dbutils.fs.ls('/databricks-datasets/nyctaxi/tripdata/yellow/')
display(yellow_trip_files)

path,name,size,modificationTime
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-01.csv.gz,yellow_tripdata_2009-01.csv.gz,504262564,1590525201000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-02.csv.gz,yellow_tripdata_2009-02.csv.gz,480034681,1590525201000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-03.csv.gz,yellow_tripdata_2009-03.csv.gz,521102719,1590525201000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-04.csv.gz,yellow_tripdata_2009-04.csv.gz,515435466,1590525201000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-05.csv.gz,yellow_tripdata_2009-05.csv.gz,531133739,1590525201000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-06.csv.gz,yellow_tripdata_2009-06.csv.gz,508802995,1590525251000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-07.csv.gz,yellow_tripdata_2009-07.csv.gz,487731497,1590525252000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-08.csv.gz,yellow_tripdata_2009-08.csv.gz,490825210,1590525253000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-09.csv.gz,yellow_tripdata_2009-09.csv.gz,503121179,1590525254000
dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2009-10.csv.gz,yellow_tripdata_2009-10.csv.gz,567109604,1590525255000


In [0]:
# Read one month of yellow-taxi trips (gzipped CSV) into a Spark DataFrame.
file_path = 'dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-12.csv.gz'

# Reading with no options (no header, no schema) gives a result that won't look good:
# df = spark.read.csv(file_path)
# df.head()

# header=True      -> take the column names from the first line
# inferSchema=True -> let Spark work out the column types
df = spark.read.csv(file_path, header=True, inferSchema=True)
# Equivalent spelling:
# df = spark.read.options(header=True, inferSchema=True).csv(file_path)

df.head()  # a single Row object: complete, but messy to read

Row(VendorID=1, tpep_pickup_datetime=datetime.datetime(2019, 12, 1, 0, 26, 58), tpep_dropoff_datetime=datetime.datetime(2019, 12, 1, 0, 41, 45), passenger_count=1, trip_distance=4.2, RatecodeID=1, store_and_fwd_flag='N', PULocationID=142, DOLocationID=116, payment_type=2, fare_amount=14.5, extra=3.0, mta_tax=0.5, tip_amount=0.0, tolls_amount=0.0, improvement_surcharge=0.3, total_amount=18.3, congestion_surcharge=2.5)

In [0]:
# show() prints a formatted table, which is cleaner to read than head().
# Without an argument show() prints 20 rows.
df.show(n=10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       1| 2019-12-01 00:26:58|  2019-12-01 00:41:45|              1|          4.2|         1|                 N|         142|         116|           2|       14.5|  3.0|    0.5|       0.0|         0.0|                  0.3

In [0]:
# where() and filter() are the same operation under two names.
# The condition can be a SQL string (as here) or a Python-style Column expression.
two_passengers = 'passenger_count = 2'

df.where(two_passengers).show(10)

df.filter(two_passengers).show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       1| 2019-12-01 00:12:03|  2019-12-01 00:33:19|              2|          9.4|         1|                 N|         138|          25|           1|       28.5|  0.5|    0.5|      10.0|         0.0|                  0.3

In [0]:
# distinct() gives a DataFrame with duplicate rows removed
unique_trips = df.distinct()

# Count the distinct rows
unique_trips.count()

6896317

In [0]:
# Number of distinct vendor ids.
# select() picks columns the way a SQL SELECT does.
vendor_ids = df.select('VendorID')
vendor_ids.distinct().count()

3

In [0]:
# First 5 values of the VendorID column
df.select('VendorID').show(n=5)

+--------+
|VendorID|
+--------+
|       1|
|       1|
|       1|
|       1|
|       1|
+--------+
only showing top 5 rows



In [0]:
# select() also accepts a list of column names
fare_columns = ['VendorID', 'passenger_count', 'fare_amount', 'tip_amount']
df.select(fare_columns).show()

+--------+---------------+-----------+----------+
|VendorID|passenger_count|fare_amount|tip_amount|
+--------+---------------+-----------+----------+
|       1|              1|       14.5|       0.0|
|       1|              1|        2.5|       0.0|
|       1|              1|        2.5|       0.0|
|       1|              2|       28.5|      10.0|
|       1|              2|        9.0|       0.0|
|       1|              2|        6.5|       0.0|
|       1|              0|       10.0|       0.0|
|       1|              0|        4.0|      1.55|
|       1|              0|        7.5|       0.0|
|       1|              1|        5.5|      1.85|
|       1|              3|       18.0|      4.35|
|       2|              1|       12.0|       0.0|
|       2|              1|        9.0|       0.0|
|       2|              1|       38.5|       0.0|
|       1|              0|       52.0|      5.53|
|       1|              1|       15.0|       2.5|
|       1|              1|       15.0|       0.0|


In [0]:
# Columns can also be passed as Column objects via attribute access on df
df.select(
    df.VendorID,
    df.passenger_count,
    df.fare_amount,
    df.tip_amount,
).show()

+--------+---------------+-----------+----------+
|VendorID|passenger_count|fare_amount|tip_amount|
+--------+---------------+-----------+----------+
|       1|              1|       14.5|       0.0|
|       1|              1|        2.5|       0.0|
|       1|              1|        2.5|       0.0|
|       1|              2|       28.5|      10.0|
|       1|              2|        9.0|       0.0|
|       1|              2|        6.5|       0.0|
|       1|              0|       10.0|       0.0|
|       1|              0|        4.0|      1.55|
|       1|              0|        7.5|       0.0|
|       1|              1|        5.5|      1.85|
|       1|              3|       18.0|      4.35|
|       2|              1|       12.0|       0.0|
|       2|              1|        9.0|       0.0|
|       2|              1|       38.5|       0.0|
|       1|              0|       52.0|      5.53|
|       1|              1|       15.0|       2.5|
|       1|              1|       15.0|       0.0|


In [0]:
# sort() orders ascending unless told otherwise.
# orderBy() and sort() behave the same here.
fare_cols = df.select(df.VendorID, df.passenger_count, df.fare_amount, df.tip_amount)

# Descending via the ascending keyword argument
fare_cols.sort('fare_amount', ascending=False).show()

# Descending via the Column's desc() method
fare_cols.sort(df.fare_amount.desc()).show()

+--------+---------------+-----------+----------+
|VendorID|passenger_count|fare_amount|tip_amount|
+--------+---------------+-----------+----------+
|       1|              1|  398468.38|       0.0|
|       1|              1|     6012.5|       0.0|
|       2|              1|     2442.5|       0.0|
|       2|              1|     1961.5|       0.0|
|       2|              2|     1472.0|       0.0|
|       1|              0|      900.0|       0.0|
|       1|              1|      844.0|       0.0|
|       2|              1|      743.0|       0.0|
|       2|              1|      709.5|       0.0|
|       2|              4|      700.0|       0.0|
|       1|              1|      700.0|      20.0|
|       2|              2|      650.0|      20.0|
|       2|              1|      609.5|       0.0|
|       1|              1|      600.0|       0.0|
|       2|              1|      600.0|       0.0|
|       1|              2|      565.0|       0.0|
|       2|              6|      544.5|       0.0|


In [0]:
from pyspark.sql.functions import col
# A column can be referenced in several interchangeable ways

fare_view = df.select(df.VendorID, df.passenger_count, df.fare_amount, df.tip_amount)

# 1. Attribute access on the DataFrame
fare_view.sort(df.fare_amount.desc()).show()

# 2. col() -- most used in test
# col() turns the string 'fare_amount' into a Column object, like df.fare_amount
fare_view.sort(col('fare_amount').desc()).show()

# 3. Bracket (item) access on the DataFrame
fare_view.sort(df['fare_amount'].desc()).show()

# Does not work: a plain string such as 'fare_amount' has no desc() method
# fare_view.sort('fare_amount'.desc()).show()

+--------+---------------+-----------+----------+
|VendorID|passenger_count|fare_amount|tip_amount|
+--------+---------------+-----------+----------+
|       1|              1|  398468.38|       0.0|
|       1|              1|     6012.5|       0.0|
|       2|              1|     2442.5|       0.0|
|       2|              1|     1961.5|       0.0|
|       2|              2|     1472.0|       0.0|
|       1|              0|      900.0|       0.0|
|       1|              1|      844.0|       0.0|
|       2|              1|      743.0|       0.0|
|       2|              1|      709.5|       0.0|
|       2|              4|      700.0|       0.0|
|       1|              1|      700.0|      20.0|
|       2|              2|      650.0|      20.0|
|       2|              1|      609.5|       0.0|
|       1|              1|      600.0|       0.0|
|       2|              1|      600.0|       0.0|
|       1|              2|      565.0|       0.0|
|       2|              6|      544.5|       0.0|


In [0]:
# Sorting on several keys: tip_amount descending, then passenger_count ascending
sort_keys = ['tip_amount', 'passenger_count']
sort_ascending = [False, True]
(
    df.select(df.VendorID, df.passenger_count, df.fare_amount, df.tip_amount)
    .orderBy(sort_keys, ascending=sort_ascending)
    .show()
)

+--------+---------------+-----------+----------+
|VendorID|passenger_count|fare_amount|tip_amount|
+--------+---------------+-----------+----------+
|       2|              1|       11.0|    404.44|
|       2|              2|       39.5|    372.36|
|       2|              1|       13.0|     323.0|
|       2|              1|       52.0|     300.0|
|       1|              1|       21.5|     300.0|
|       2|              1|       20.0|     300.0|
|       1|              1|        2.5|     297.0|
|       1|              1|      137.0|    289.45|
|       1|              1|       0.01|     250.0|
|       1|              1|       52.0|     239.0|
|       1|              1|        2.5|     229.7|
|       1|              1|        0.0|     225.0|
|       2|              1|       18.5|    222.22|
|       2|              1|       25.0|     220.0|
|       2|              1|        3.5|     200.8|
|       2|              2|       13.5|    200.18|
|       2|              1|        9.0|    200.08|


In [0]:
# Remove columns we do not need.
# drop() takes column names (or Columns) as separate arguments, not a list,
# so a collection of names has to be unpacked with *.
# drop() returns a new DataFrame rather than changing df in place, hence the reassignment.
unwanted_columns = ('extra', 'mta_tax', 'congestion_surcharge', 'store_and_fwd_flag')
df = df.drop(*unwanted_columns)

In [0]:
# Remove rows containing nulls.
# how='any' -> a row is removed if any inspected column is null
# subset    -> only these columns are inspected
# df.na.drop() == df.dropna(); df.na.fill() replaces nulls instead of dropping rows.
# The result is not assigned here, so df itself is left unchanged.
df.na.drop(how='any', subset=['VendorID', 'fare_amount'])

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: int, trip_distance: double, RatecodeID: int, PULocationID: int, DOLocationID: int, payment_type: int, fare_amount: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double]

In [0]:
# describe() returns a DataFrame of basic statistics (count, mean, stddev, min, max)
stats_input = df.select(['passenger_count', 'fare_amount'])
stats_input.describe().show()

+-------+------------------+------------------+
|summary|   passenger_count|       fare_amount|
+-------+------------------+------------------+
|  count|           6845299|           6896317|
|   mean|1.5508773247158378|13.590267515261928|
| stddev|1.1743299363249946|152.26922211518485|
|    min|                 0|           -1472.0|
|    max|                 9|         398468.38|
+-------+------------------+------------------+



In [0]:
# summary() reports count, mean, stddev, min, max plus the 25%/50%/75% percentiles
quartile_input = df.select(['passenger_count', 'fare_amount'])
quartile_input.summary().show()

+-------+------------------+------------------+
|summary|   passenger_count|       fare_amount|
+-------+------------------+------------------+
|  count|           6845299|           6896317|
|   mean|1.5508773247158378|13.590267515261928|
| stddev|1.1743299363249946|152.26922211518485|
|    min|                 0|           -1472.0|
|    25%|                 1|               6.5|
|    50%|                 1|               9.5|
|    75%|                 2|              15.5|
|    max|                 9|         398468.38|
+-------+------------------+------------------+



In [0]:
# printSchema() only prints the schema tree and returns None, so its return
# value must not be stored as the schema.
df.printSchema()

# df.schema is the schema object itself (a StructType). It can be passed as
# schema=nyc_taxi_schema when reading more CSV files, instead of inferSchema.
# NOTE: columns were dropped from df earlier in this notebook, so this schema
# only matches files that have the same reduced column layout.
nyc_taxi_schema = df.schema

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [0]:
# Difference between

# df.head(5)
# Returns the first 5 rows as a list of Row objects
# (df.head() with no argument returns a single Row)

# df.show(5)
# Prints the first 5 rows in a formatted table structure

# df.take(5)
# Returns the first 5 rows as a list of Row objects
# Same result as df.head(5)




In [0]:
# Returns a random sample of roughly 10% of our data. The fraction is a
# per-row sampling probability, so the sample size is approximate rather
# than exactly 10% of the row count.
# A fixed seed keeps the sample repeatable when the notebook is re-run on
# the same data.
df.sample(fraction=0.1, seed=42).show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------+------------+------------+-----------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+----------+------------+------------+------------+-----------+----------+------------+---------------------+------------+
|       1| 2019-12-01 00:25:53|  2019-12-01 00:26:04|              1|          0.0|         1|         145|         145|           2|        2.5|       0.0|         0.0|                  0.3|         3.8|
|       1| 2019-12-01 00:36:16|  2019-12-01 00:53:42|              3|          5.5|         1|          79|         226|           1|       18.0|      4.35|         0.0|           

In [0]:
# Like in SQL
# df.join()