In [1]:
from datetime import datetime, timedelta
import uuid
import holidays

# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# module containing functions for manipulation pyspark dataframes
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for the class
from helpers import display, read_df, write_df

## [PySpark SQL docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## Create a Spark Session

In [2]:
spark = (
    SparkSession
    .builder
    .appName('data_processing')
    .master('local[2]')
    .getOrCreate()
)

## Read in data file

In [3]:
df = read_df(spark, '../taxi_2016')

In [4]:
df.columns

['trip_id',
 'taxi_id',
 'start_time',
 'end_time',
 'trip_miles',
 'pickup_census_tract',
 'dropoff_census_tract',
 'fare',
 'tips',
 'trip_total',
 'payment_type',
 'company']

let's take a smaller sample so we can iterate faster

In [5]:
#df = df.sample(fraction=.1)

In [6]:
# only have tip data for credit card transactions
tips = df.where(f.col('payment_type') == 'Credit Card')

# Features

### Columns we get for free

In [7]:
display(
    tips.select(
        'trip_id',
        'company',
        'trip_miles',
        'fare',
    )
)

Unnamed: 0,trip_id,company,trip_miles,fare
0,2d4585c3a01188a7032e7bea0f2ac686a869832c,,0.8,5.5
1,2d458675ac892b200cb039fbbe845e2e90c1131c,Choice Taxi Association,2.6,9.75
2,2d458989ecc7bd5aa9f8657c4bbe430347ab5189,,1.8,10.25
3,2d458a76dea4503ee43e5f6f41166a219eacb7ce,Dispatch Taxi Affiliation,5.1,16.5
4,2d458ce051e1c3df37cbc558f5b6d2b08124f759,Choice Taxi Association,0.9,5.25


### Time based features

In [8]:
display(
    tips.select(
        'start_time',
        'end_time',
        f.month('start_time').alias('start_month'),
        f.dayofweek('start_time').alias('start_day_of_week'),
        f.hour('start_time').alias('start_hour'),
    )
)

Unnamed: 0,start_time,end_time,start_month,start_day_of_week,start_hour
0,2016-12-17 23:30:00,2016-12-17 23:30:00,12,7,23
1,2016-02-20 02:30:00,2016-02-20 02:30:00,2,7,2
2,2016-08-17 18:45:00,2016-08-17 19:00:00,8,4,18
3,2016-07-15 00:15:00,2016-07-15 00:30:00,7,6,0
4,2016-02-01 14:15:00,2016-02-01 14:30:00,2,2,14


In [9]:
display(
    tips
    .withColumn(
        'trip_minutes',
        (f.unix_timestamp(f.col('end_time')) - f.unix_timestamp(f.col('start_time')))/60,
    )
    .select('start_time', 'end_time', 'trip_minutes')
)

Unnamed: 0,start_time,end_time,trip_minutes
0,2016-12-17 23:30:00,2016-12-17 23:30:00,0.0
1,2016-02-20 02:30:00,2016-02-20 02:30:00,0.0
2,2016-08-17 18:45:00,2016-08-17 19:00:00,15.0
3,2016-07-15 00:15:00,2016-07-15 00:30:00,15.0
4,2016-02-01 14:15:00,2016-02-01 14:30:00,15.0


### Location based feature

In [10]:
avg_miles_by_census_tract = (
    tips
    .groupby('dropoff_census_tract')
    .agg(f.avg(f.col('trip_miles')).alias('avg_miles_by_census_tract'))
)

display(avg_miles_by_census_tract, 10)

Unnamed: 0,dropoff_census_tract,avg_miles_by_census_tract
0,17031832600,3.349968
1,17031837400,2.8
2,17031062200,5.407275
3,17031843200,3.635294
4,17031806900,9.846667
5,17031020602,1.61875
6,17031130200,0.8
7,17031241300,3.583516
8,17031150501,1.4
9,17031838000,3.928857


In [11]:
display(
    tips
    .join(avg_miles_by_census_tract, on='dropoff_census_tract', how='left')
    .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
)

Unnamed: 0,pickup_census_tract,dropoff_census_tract,avg_miles_by_census_tract
0,17031839100,17031832600,3.349968
1,17031081800,17031832600,3.349968
2,17031081800,17031832600,3.349968
3,17031081600,17031832600,3.349968
4,17031980000,17031832600,3.349968


In [12]:
census_block_window = Window().partitionBy('dropoff_census_tract')

display(
    tips
    .withColumn('avg_miles_by_census_tract', f.avg(f.col('trip_miles')).over(census_block_window))
    .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
)

Unnamed: 0,pickup_census_tract,dropoff_census_tract,avg_miles_by_census_tract
0,17031839100,17031832600,3.349968
1,17031081800,17031832600,3.349968
2,17031081800,17031832600,3.349968
3,17031081600,17031832600,3.349968
4,17031980000,17031832600,3.349968


## User Defined Functions (UDFs)
for sometimes it's helpful to take advandate of other python libraries, and udfs let us do thath

### Example python UDF: adding and verifying a uuid column

In [13]:
create_uuid_udf = f.udf(lambda c: str(uuid.uuid4()), t.StringType())

tips = tips.withColumn('trip_uuid', create_uuid_udf(f.col('trip_id')))

display(tips.select('trip_uuid'))

Unnamed: 0,trip_uuid
0,48aafcbb-2bcb-40d8-9312-fa92c4121d39
1,f4b7446b-db22-450b-9cdd-98f2871843d4
2,9df21b52-8322-48e5-a0a1-7ee3fef38fdf
3,ef02d43b-bde3-4f77-843f-42d8882a8f02
4,f10526b2-879f-493d-90b4-cdec82108332


In [14]:
tips = tips.withColumn(
    'trip_uuid',
    f.when(f.col('trip_uuid').startswith('a'), f.lit('zzzzzzzzzz')).otherwise(f.col('trip_uuid'))
)
# tips.cache() # why is this needed ?

In [15]:
def check_uuid(x):
    """Test if the string passed in is a valid UUID - if not, return None"""
    try:
        uuid.UUID(x)
        return x
    except ValueError:
        return None


check_uuid_udf = f.udf(check_uuid)

tips = (
    tips
    .withColumn('trip_uuid_check', check_uuid_udf(f.col('trip_uuid')))
)
display(
    tips
    .where(f.col('trip_uuid_check').isNull())
    .select('trip_uuid', 'trip_uuid_check')
)

Unnamed: 0,trip_uuid,trip_uuid_check
0,8203b52a-1cfa-45fc-8086-6ef9d5bf6bfa,8203b52a-1cfa-45fc-8086-6ef9d5bf6bfa
1,fa1f367f-f134-4a33-a7a4-e8e48fb6e4ed,fa1f367f-f134-4a33-a7a4-e8e48fb6e4ed
2,zzzzzzzzzz,
3,5abe1f80-357d-49f0-a68f-f1c79713646d,5abe1f80-357d-49f0-a68f-f1c79713646d
4,0a340142-af81-4bca-9b59-828990e67c71,0a340142-af81-4bca-9b59-828990e67c71


### Example pandas (vectorized) UDF: finding holidays

In [16]:
from pandas.tseries.holiday import USFederalHolidayCalendar

In [17]:
cal = USFederalHolidayCalendar()
holiday_list = cal.holidays(start='2016-01-01', end='2017-01-01')

In [18]:
@f.pandas_udf('integer')
def holiday_udf(x):
    return x.isin(holiday_list)


tips = tips.withColumn('is_holiday', holiday_udf(f.col('start_time')))

display(tips.select('is_holiday', 'start_time').where('is_holiday = 1'))

Unnamed: 0,is_holiday,start_time
0,1,2016-05-30
1,1,2016-01-01
2,1,2016-05-30
3,1,2016-10-10
4,1,2016-07-04


## Exercises

### Can you build a function that takes in a dataframe with the columns found in `taxi_2016` and output features discussed above?

In [19]:
def build_features(raw_df):
    avg_miles_by_census_tract = (
        raw_df
        .groupby('dropoff_census_tract')
        .agg(f.avg(f.col('trip_miles')).alias('avg_trip_miles_by_dropoff_census_tract'))
    )

    features = (
        raw_df
        .where(f.col('tips').isNotNull())
        .join(avg_miles_by_census_tract, on='dropoff_census_tract')
        .select(
            'trip_id',
            'company',
            'trip_miles',
            'fare',
            f.month('start_time').alias('start_month'),
            f.dayofweek('start_time').alias('start_day_of_week'),
            f.hour('start_time').alias('start_hour'),
            (f.unix_timestamp(f.col('end_time')) - f.unix_timestamp(f.col('start_time'))).alias('trip_minutes'),
            'avg_trip_miles_by_dropoff_census_tract',
            f.col('tips').alias('label'),
        )
    )
    
    return features

In [20]:
display(build_features(tips))

Unnamed: 0,trip_id,company,trip_miles,fare,start_month,start_day_of_week,start_hour,trip_minutes,avg_trip_miles_by_dropoff_census_tract,label
0,2d495c6e8ee5cad52f64de2844f5bdd9999c29f5,,3.3,13.25,7,4,18,900,3.349968,2.06
1,2d4c21df0917fd4c630c704b4d1b501c8aec5e14,,2.1,9.5,12,6,19,900,3.349968,1.5
2,2d4fbb3ea94d4ed30fb45c9c7f278ac75b632fff,Taxi Affiliation Services,0.1,12.25,10,4,16,900,3.349968,3.05
3,2d50340947f1362cdebc258576e8c84423e25f6e,Northwest Management LLC,2.7,10.25,1,5,20,0,3.349968,2.55
4,2d51c476faa582cfa6a4ff99a1e8c08ac2396a0b,Northwest Management LLC,15.0,43.5,8,6,15,4500,3.349968,14.25


### What other data points might be useful to predict what tip a given trip would have?
### Can you construct a column with that information?

Since this prompt is wide open I've not written out a direct solution, but encourage
you to play with different options!

In [21]:
spark.stop()