In [None]:
from datetime import datetime, timedelta

# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# module containing functions for manipulating pyspark dataframes
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for the class
from helpers import display, read_df, write_df

## [PySpark SQL docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## Create a Spark Session

In [None]:
# Build (or reuse, if one already exists) a Spark session for this notebook:
# application name 'data_processing', master 'local[2]' (local mode, 2 threads).
spark = SparkSession.builder.appName('data_processing').master('local[2]').getOrCreate()

## Read in data file

In [None]:
# Load the dataset stored at '../taxi_2016' through the project helper.
# NOTE(review): read_df is defined in helpers.py and not visible here; the
# result is used as a Spark DataFrame by the cells below -- confirm format/schema.
df = read_df(spark, '../taxi_2016')

In [None]:
# Show the column names available in the loaded data.
df.columns

In [None]:
# Tip data only exists for credit card transactions, so keep just those rows.
tips = df.filter(f.col('payment_type') == 'Credit Card')

# Features

### Columns we get for free

In [None]:
# Columns usable as features as-is, with no transformation required.
display(df.select('trip_id', 'company', 'trip_miles', 'fare'))

### Time based features

In [None]:
# Derive calendar features from the trip start timestamp.
# Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday).
display(
    df.select(
        f.col('start_time'),
        f.col('end_time'),
        f.month(f.col('start_time')).alias('start_month'),
        f.dayofweek(f.col('start_time')).alias('start_day_of_week'),
        f.hour(f.col('start_time')).alias('start_hour'),
    )
)

In [None]:
# Native PySpark, withColumn variant: convert both timestamps to epoch seconds,
# subtract, and divide by 60 to express the trip duration in minutes.
# The second argument to display is passed through to the helper unchanged
# (presumably the number of rows to show -- see helpers.display).
display(
    tips
    .withColumn(
        'trip_minutes',
        (f.unix_timestamp('end_time') - f.unix_timestamp('start_time')) / 60,
    )
    .select('start_time', 'end_time', 'trip_minutes'),
    10,
)

In [None]:
# Native PySpark, select variant: the same duration-in-minutes expression,
# but computed and aliased directly inside a single select.
display(
    tips.select(
        'start_time',
        'end_time',
        ((f.unix_timestamp('end_time') - f.unix_timestamp('start_time')) / 60).alias('trip_minutes'),
    ),
    10,
)

In [None]:
# pandas (vectorized) UDF
def time_diff_pd(start_time_series, end_time_series):
    """Return the elapsed time from start to end, in minutes, element-wise.

    Both arguments are expected to support subtraction that yields timedeltas
    (e.g. pandas datetime Series); the difference is divided by a one-minute
    timedelta, so the result holds fractional minutes.
    """
    one_minute = timedelta(minutes=1)
    elapsed = end_time_series - start_time_series
    return elapsed / one_minute

# Wrap time_diff_pd as a pandas (vectorized) UDF. The function returns
# fractional minutes as floats, so the declared return type must be a double;
# an integer type does not match the float Series the function produces.
time_diff_pd_udf = f.pandas_udf(time_diff_pd, t.DoubleType())

# Compute trip_minutes with the pandas UDF and preview the result.
# (A stray trailing character after the closing parenthesis made this cell a
# syntax error; it has been removed.)
display(
    (
        tips
        .withColumn('trip_minutes', time_diff_pd_udf(f.col('start_time'), f.col('end_time')))
        .select('start_time', 'end_time', 'trip_minutes')
    ),
    10,
)

In [None]:
# python UDF: row-at-a-time trip duration in minutes.
# - The return type is declared explicitly: f.udf defaults to a string return
#   type, which would turn the numeric duration into text.
# - A missing start or end timestamp yields null instead of raising a
#   TypeError inside the UDF.
time_diff_udf = f.udf(
    lambda start_time, end_time: (
        None
        if start_time is None or end_time is None
        else (end_time - start_time) / timedelta(minutes=1)
    ),
    t.DoubleType(),
)

# Compute trip_minutes with the row-at-a-time Python UDF and preview the result.
display(
    tips
    .withColumn('trip_minutes', time_diff_udf('start_time', 'end_time'))
    .select('start_time', 'end_time', 'trip_minutes'),
    10,
)

### Location based feature

In [None]:
# Aggregate: mean trip distance for each dropoff census tract
# (one output row per distinct dropoff_census_tract value).
avg_miles_by_census_tract = tips.groupBy('dropoff_census_tract').agg(
    f.avg('trip_miles').alias('avg_miles_by_census_tract')
)

display(avg_miles_by_census_tract, 10)

In [None]:
# Attach the per-tract average to every trip with a left join on the dropoff
# tract, so each trip row keeps its place and gains the aggregate column.
display(
    tips
    .join(avg_miles_by_census_tract, 'dropoff_census_tract', 'left')
    .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract'),
    10,
)

In [None]:
# Window alternative to the groupby + join above: partition the rows by dropoff
# census tract and average trip_miles over each partition, which adds the
# aggregate to every trip row in a single pass.
# Window.partitionBy is a static method, so the Window class is used directly
# rather than instantiated.
census_block_window = Window.partitionBy('dropoff_census_tract')

display(
    (
        tips
        .withColumn('avg_miles_by_census_tract', f.avg(f.col('trip_miles')).over(census_block_window))
        .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
    ),
    10
)

## Exercises

In [None]:
# Can you build a function that takes in a dataframe with the columns found in `taxi_2016`
# and outputs the features discussed above?

In [None]:
# What other data points might be useful to predict what tip a given trip would have?
# Can you construct a column with that information?

# Since this prompt is wide open I've not written out a direct solution, but encourage
# you to play with different options!

In [None]:
# Stop the Spark session created at the top of the notebook now that the work is done.
spark.stop()