In [None]:
from datetime import datetime, timedelta
import uuid
import holidays

# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# module containing functions for manipulating pyspark dataframes
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for the class
from helpers import display, read_df, write_df

## [PySpark SQL docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## Create a Spark Session

In [None]:
# Configure a local-mode session named 'data_processing' on two local threads;
# getOrCreate() hands back an already-running session if there is one.
session_builder = SparkSession.builder.appName('data_processing').master('local[2]')
spark = session_builder.getOrCreate()

## Read in data file

In [None]:
# Load the dataset stored at ../taxi_2016 through the project's read_df helper.
# NOTE(review): file format and schema are decided inside helpers.read_df,
# which is not visible from this notebook.
df = read_df(spark, '../taxi_2016')

In [None]:
# List the column names available for feature building.
df.columns

In [None]:
# Tip amounts are only available for credit-card transactions,
# so keep just those rows.
paid_by_card = f.col('payment_type') == 'Credit Card'
tips = df.where(paid_by_card)

# Features

### Columns we get for free

In [None]:
# Columns that can be used as features without any transformation.
ready_made_columns = ['trip_id', 'company', 'trip_miles', 'fare']
display(tips.select(*ready_made_columns))

### Time based features

In [None]:
# Month, day of week and hour extracted from the trip start time,
# shown next to the raw start/end timestamps.
trip_start = f.col('start_time')
time_feature_columns = [
    'start_time',
    'end_time',
    f.month(trip_start).alias('start_month'),
    f.dayofweek(trip_start).alias('start_day_of_week'),
    f.hour(trip_start).alias('start_hour'),
]
display(tips.select(*time_feature_columns))

In [None]:
# Trip duration: difference between the end and start unix timestamps
# (seconds), divided by 60 to express it in minutes.
start_seconds = f.unix_timestamp(f.col('start_time'))
end_seconds = f.unix_timestamp(f.col('end_time'))
trip_minutes = (end_seconds - start_seconds) / 60

display(
    tips
    .withColumn('trip_minutes', trip_minutes)
    .select('start_time', 'end_time', 'trip_minutes')
)

### Location based feature

In [None]:
# Average trip distance per drop-off census tract.
mean_trip_miles = f.avg(f.col('trip_miles')).alias('avg_miles_by_census_tract')
avg_miles_by_census_tract = tips.groupby('dropoff_census_tract').agg(mean_trip_miles)

display(avg_miles_by_census_tract, 10)

In [None]:
# Attach the per-tract average to every trip with a left join on the
# drop-off tract, so trips without a matching tract row are still kept.
trips_with_tract_avg = tips.join(
    avg_miles_by_census_tract,
    on='dropoff_census_tract',
    how='left',
)
display(
    trips_with_tract_avg.select(
        'pickup_census_tract',
        'dropoff_census_tract',
        'avg_miles_by_census_tract',
    )
)

In [None]:
# Window-function alternative to a groupby followed by a join: the average is
# computed over all rows sharing a drop-off tract and written onto each row.
census_block_window = Window.partitionBy('dropoff_census_tract')
tract_avg_miles = f.avg(f.col('trip_miles')).over(census_block_window)

display(
    tips
    .withColumn('avg_miles_by_census_tract', tract_avg_miles)
    .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
)

## User Defined Functions (UDFs)
Sometimes it's helpful to take advantage of other Python libraries, and UDFs let us do that.

### Example python UDF: adding and verifying a uuid column

In [None]:
# uuid4() returns a different value on every call, so the UDF is flagged as
# non-deterministic. Spark treats UDFs as deterministic by default and may
# then eliminate duplicate invocations or call the function more often than
# it appears in the query.
# The column argument is ignored by the lambda; it is only passed so the UDF
# has a column to be applied to.
# Note: the uuids are still regenerated each time the plan is re-executed
# unless the dataframe is cached or persisted.
create_uuid_udf = f.udf(lambda c: str(uuid.uuid4()), t.StringType()).asNondeterministic()

tips = tips.withColumn('trip_uuid', create_uuid_udf(f.col('trip_id')))

display(tips.select('trip_uuid'))

In [None]:
# Replace every uuid that starts with 'a' by a fixed string that is not a
# valid uuid; all other values are left untouched.
starts_with_a = f.col('trip_uuid').startswith('a')
corrupted_uuid = f.when(starts_with_a, f.lit('zzzzzzzzzz')).otherwise(f.col('trip_uuid'))
tips = tips.withColumn('trip_uuid', corrupted_uuid)
# tips.cache() # why is this needed ?

In [None]:
def check_uuid(x):
    """Return ``x`` unchanged if it parses as a UUID, otherwise ``None``.

    Parameters
    ----------
    x : str or None
        Candidate UUID string. ``None`` and non-string values are treated as
        invalid rather than raising, so a null cell cannot crash the UDF
        built on top of this function.

    Returns
    -------
    str or None
        The original value when ``uuid.UUID`` accepts it, else ``None``.
    """
    try:
        uuid.UUID(x)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed hex string; TypeError: None or bytes input;
        # AttributeError: other non-string input (e.g. an int).
        return None
    return x


# Wrap the validator as a Spark UDF; no return type is given, so f.udf's
# default (string) applies.
check_uuid_udf = f.udf(check_uuid)

tips = tips.withColumn('trip_uuid_check', check_uuid_udf(f.col('trip_uuid')))

# Rows whose check column is null are the ones that failed validation.
failed_uuid_check = tips.where(f.col('trip_uuid_check').isNull())
display(failed_uuid_check.select('trip_uuid', 'trip_uuid_check'))

### Example pandas (vectorized) UDF: finding holidays

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar

In [None]:
# US federal holiday dates between 2016-01-01 and 2017-01-01, taken from
# pandas' built-in calendar.
cal = USFederalHolidayCalendar()
holiday_list = cal.holidays(
    start='2016-01-01',
    end='2017-01-01',
)

In [None]:
@f.pandas_udf('integer')
def holiday_udf(x):
    """Flag values whose calendar date appears in ``holiday_list``.

    Parameters
    ----------
    x : pandas.Series
        Batch of timestamps (or values ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.Series
        int32 Series: 1 where the date is in ``holiday_list``, 0 otherwise
        (missing timestamps yield 0).
    """
    # The holiday calendar yields dates at midnight, so the time of day must be
    # dropped first; otherwise only trips starting exactly at 00:00 would match.
    on_holiday = pd.to_datetime(x).dt.normalize().isin(holiday_list)
    # Cast explicitly so the result matches the declared 'integer' return type.
    return on_holiday.astype('int32')


tips = tips.withColumn('is_holiday', holiday_udf(f.col('start_time')))

# Show only the trips flagged as starting on a holiday.
holiday_trips = tips.select('is_holiday', 'start_time').where(f.col('is_holiday') == 1)
display(holiday_trips)

## Exercises

### Can you build a function that takes in a dataframe with the columns found in `taxi_2016` and outputs the features discussed above?

### What other data points might be useful to predict what tip a given trip would have?
### Can you construct a column with that information?

Since this prompt is wide open I've not written out a direct solution, but encourage
you to play with different options!

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()