In [2]:
# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# module containing functions for manipulating pyspark dataframes
import pyspark.sql.functions as f

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for intro class
from helpers import display, read_df

## [PySpark SQL docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)
 - the main functions you'll need to manipulate data in PySpark DataFrames are documented here; they live in `pyspark.sql.functions`, imported above as `f`

## [Data Dictionary](https://digital.cityofchicago.org/index.php/chicago-taxi-data-released/)
 - details about the dataset used, here filtered down to just 2016 data

## Create a Spark Session

In [3]:
# Describe the session we want: an app named 'data_exploration' on a local
# master with two worker threads ('local[2]').
session_builder = SparkSession.builder.appName('data_exploration').master('local[2]')

# Reuse a matching session if one is already running, otherwise start one.
spark = session_builder.getOrCreate()

## Read in data file

In [4]:
# Relative path to the 2016 taxi data.
taxi_data_path = '../taxi_2016'

# read_df comes from the course's helpers module; presumably it loads the data
# at the given path into a Spark DataFrame -- confirm in helpers.py.
df = read_df(spark, taxi_data_path)

In [5]:
# Preview the loaded data. Note: `display` here is the function imported from
# the local helpers module, not IPython's built-in display.
display(df)

Unnamed: 0,trip_id,taxi_id,start_time,end_time,trip_miles,pickup_census_tract,dropoff_census_tract,fare,tips,trip_total,payment_type,company
0,2d4585c3a01188a7032e7bea0f2ac686a869832c,fe33d0d63aa20b97b9d4440ad1b6637ea21a0df546aa0d...,2016-12-17 23:30:00,2016-12-17 23:30:00,0.8,17031280000.0,17031830000.0,5.5,1.0,7.0,Credit Card,
1,2d458675ac892b200cb039fbbe845e2e90c1131c,c12e0923159de80ee3288c44047308ab8f602fbfb2278f...,2016-02-20 02:30:00,2016-02-20 02:30:00,2.6,17031080000.0,17031240000.0,9.75,2.15,12.9,Credit Card,Choice Taxi Association
2,2d4587774ae4ef68c78e7f328c6b0a12873a50db,0083fcde0fb490b4ec424c63d1d750378f6ad11154d1d4...,2016-02-11 07:15:00,2016-02-11 07:45:00,0.8,,,39.5,0.0,39.5,Cash,Taxi Affiliation Services
3,2d458989ecc7bd5aa9f8657c4bbe430347ab5189,874b25eb25690a1a0361023234f59edeee52101733a1ab...,2016-08-17 18:45:00,2016-08-17 19:00:00,1.8,17031080000.0,17031830000.0,10.25,2.0,12.75,Credit Card,
4,2d458a76dea4503ee43e5f6f41166a219eacb7ce,45772661016b0b77e44155eaf32f92d2f4e9e2702c0fc3...,2016-07-15 00:15:00,2016-07-15 00:30:00,5.1,,,16.5,3.0,21.0,Credit Card,Dispatch Taxi Affiliation


In [6]:
# List the column names of the DataFrame.
df.columns

['trip_id',
 'taxi_id',
 'start_time',
 'end_time',
 'trip_miles',
 'pickup_census_tract',
 'dropoff_census_tract',
 'fare',
 'tips',
 'trip_total',
 'payment_type',
 'company']

In [None]:
# Preview again, passing 10 as the helper's second argument
# (presumably the number of rows to show -- confirm in helpers.py).
display(df, 10)

In [None]:
# Count every row in the dataset (~3 million trips). The result is kept in
# total_rows because later cells use it as a denominator.
total_rows = df.count()
print(total_rows)

In [None]:
# How many different taxis appear in the data?
distinct_taxi_count = df.agg(f.countDistinct('taxi_id'))
display(distinct_taxi_count)

In [None]:
# Mean miles per trip, computed as total miles over the total row count.
# The denominator is every row, so trips with a null trip_miles still count
# (f.avg would divide by the non-null rows only).
miles_per_row = (f.sum('trip_miles') / total_rows).alias('trip_miles')
display(df.agg(miles_per_row))

In [None]:
# For each column, the share of rows holding a non-null value:
# f.count(column) only counts non-null entries, total_rows counts all rows.
non_null_fractions = [
    (f.count(column_name) / total_rows).alias(column_name)
    for column_name in df.columns
]
display(df.agg(*non_null_fractions))

In [None]:
# One row per taxi_id, with a 'count' column holding that taxi's number of trips.
trips_per_taxi = (
    df
    .groupBy('taxi_id')
    .count()
)

In [None]:
# Preview the per-taxi trip counts (10 is passed as the helper's second
# argument, presumably a row limit -- confirm in helpers.py).
display(trips_per_taxi, 10)

In [None]:
# Distribution of the number of trips made by each taxi.
#
# sns.distplot is deprecated (seaborn 0.11) and removed in current releases,
# so this uses sns.histplot with a KDE overlay and density normalisation,
# which is the equivalent of distplot's default output.

# Bring the single 'count' column to the driver as a pandas Series.
trip_counts = trips_per_taxi.select('count').toPandas()['count']

fig, ax = plt.subplots()
# A fixed bin count keeps a long tail from producing thousands of narrow bins.
sns.histplot(x=trip_counts, bins=50, kde=True, stat='density', ax=ax)
ax.set(title='Trips Per Taxi', xlabel='Trips per taxi', ylabel='Density');

In [None]:
# Total miles driven by each taxi: sum trip_miles within each taxi_id group
# and expose the result as a 'miles' column.
total_miles = f.sum('trip_miles').alias('miles')
distance_traveled_per_taxi = df.groupBy('taxi_id').agg(total_miles)

In [None]:
# Preview the per-taxi mileage totals (10 is passed as the helper's second
# argument, presumably a row limit -- confirm in helpers.py).
display(distance_traveled_per_taxi, 10)

In [None]:
# Distribution of total miles traveled by each taxi.
#
# sns.distplot is deprecated (seaborn 0.11) and removed in current releases,
# so this uses sns.histplot with a KDE overlay and density normalisation,
# which is the equivalent of distplot's default output.

# Bring the single 'miles' column to the driver as a pandas Series.
miles_per_taxi = distance_traveled_per_taxi.select('miles').toPandas()['miles']

fig, ax = plt.subplots()
# A fixed bin count keeps extreme mileage totals from producing thousands of
# narrow bins.
sns.histplot(x=miles_per_taxi, bins=50, kde=True, stat='density', ax=ax)
ax.set(title='Miles Traveled Per Taxi', xlabel='Miles per taxi', ylabel='Density');

In [None]:
# Same distribution as the full miles-per-taxi plot, restricted to taxis whose
# total is under 1000 miles so the bulk of the distribution is visible.
#
# sns.distplot is deprecated (seaborn 0.11) and removed in current releases,
# so this uses sns.histplot with a KDE overlay and density normalisation,
# which is the equivalent of distplot's default output.

# Filter in Spark first so only the rows being plotted are collected.
miles_under_1000 = (
    distance_traveled_per_taxi
    .where('miles < 1000')
    .select('miles')
    .toPandas()['miles']
)

fig, ax = plt.subplots()
sns.histplot(x=miles_under_1000, bins=50, kde=True, stat='density', ax=ax)
ax.set(
    title='Miles Traveled Per Taxi (capped at 1000)',
    xlabel='Miles per taxi',
    ylabel='Density',
);

In [None]:
# Taxis sorted from most to fewest total miles, to inspect the extreme values.
taxis_by_miles_desc = distance_traveled_per_taxi.orderBy(f.col('miles').desc())
display(taxis_by_miles_desc, 10)

## Exercises

In [None]:
# when do most trips occur? 

In [None]:
# what's the most common length for a trip in miles? in minutes?

In [None]:
# are there companies that only use cash or only use credit?

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` or its DataFrames will need the session re-created before they
# can run again.
spark.stop()