In [34]:
from datetime import datetime, timedelta

# pandas and plotting libraries for visualizations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# module containing functions for manipulation pyspark dataframes
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.window import Window

# class which will let us create spark objects
from pyspark.sql import SparkSession

# helper functions for the class
from helpers import display, read_df, write_df

## [PySpark SQL docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## Create a Spark Session

In [35]:
spark = (
    SparkSession
    .builder
    .appName('data_processing')
    .master('local[2]')
    .getOrCreate()
)

## Read in data file

In [36]:
df = read_df(spark, '../taxi_2016')

In [37]:
df.columns

['trip_id',
 'taxi_id',
 'start_time',
 'end_time',
 'trip_miles',
 'pickup_census_tract',
 'dropoff_census_tract',
 'fare',
 'tips',
 'trip_total',
 'payment_type',
 'company']

In [38]:
# only have tip data for credit card transactions
tips = df.where(f.col('payment_type') == 'Credit Card')

# Features

### Columns we get for free

In [39]:
display(
    df.select(
        'trip_id',
        'company',
        'trip_miles',
        'fare',
    )
)

Unnamed: 0,trip_id,company,trip_miles,fare
0,2d4585c3a01188a7032e7bea0f2ac686a869832c,,0.8,5.5
1,2d458675ac892b200cb039fbbe845e2e90c1131c,Choice Taxi Association,2.6,9.75
2,2d4587774ae4ef68c78e7f328c6b0a12873a50db,Taxi Affiliation Services,0.8,39.5
3,2d458989ecc7bd5aa9f8657c4bbe430347ab5189,,1.8,10.25
4,2d458a76dea4503ee43e5f6f41166a219eacb7ce,Dispatch Taxi Affiliation,5.1,16.5


### Time based features

In [40]:
display(
    df.select(
        'start_time',
        'end_time',
        f.month('start_time').alias('start_month'),
        f.dayofweek('start_time').alias('start_day_of_week'),
        f.hour('start_time').alias('start_hour'),
    )
)

Unnamed: 0,start_time,end_time,start_month,start_day_of_week,start_hour
0,2016-12-17 23:30:00,2016-12-17 23:30:00,12,7,23
1,2016-02-20 02:30:00,2016-02-20 02:30:00,2,7,2
2,2016-02-11 07:15:00,2016-02-11 07:45:00,2,5,7
3,2016-08-17 18:45:00,2016-08-17 19:00:00,8,4,18
4,2016-07-15 00:15:00,2016-07-15 00:30:00,7,6,0


In [41]:
# raw pyspark
display(
    (
        tips
        .withColumn(
            'trip_minutes',
            (f.unix_timestamp(f.col('end_time')) - f.unix_timestamp(f.col('start_time')))/60,
        )
        .select('start_time', 'end_time', 'trip_minutes')
    ),
    10,
)

Unnamed: 0,start_time,end_time,trip_minutes
0,2016-12-17 23:30:00,2016-12-17 23:30:00,0.0
1,2016-02-20 02:30:00,2016-02-20 02:30:00,0.0
2,2016-08-17 18:45:00,2016-08-17 19:00:00,15.0
3,2016-07-15 00:15:00,2016-07-15 00:30:00,15.0
4,2016-02-01 14:15:00,2016-02-01 14:30:00,15.0
5,2016-12-12 18:00:00,2016-12-12 18:00:00,0.0
6,2016-03-19 16:15:00,2016-03-19 16:30:00,15.0
7,2016-11-16 14:30:00,2016-11-16 15:15:00,45.0
8,2016-03-22 20:00:00,2016-03-22 20:15:00,15.0
9,2016-11-24 02:00:00,2016-11-24 02:00:00,0.0


In [42]:
# pandas (vectorized) UDF
def time_diff_pd(start_time_series, end_time_series):
    return (end_time_series - start_time_series)/timedelta(minutes=1)

time_diff_pd_udf = f.udf(time_diff_pd)

display(
    (
        tips
        .withColumn('trip_minutes', time_diff_pd_udf(f.col('start_time'), f.col('end_time')))
        .select('start_time', 'end_time', 'trip_minutes')
    ),
    10,
)

Unnamed: 0,start_time,end_time,trip_minutes
0,2016-12-17 23:30:00,2016-12-17 23:30:00,0.0
1,2016-02-20 02:30:00,2016-02-20 02:30:00,0.0
2,2016-08-17 18:45:00,2016-08-17 19:00:00,15.0
3,2016-07-15 00:15:00,2016-07-15 00:30:00,15.0
4,2016-02-01 14:15:00,2016-02-01 14:30:00,15.0
5,2016-12-12 18:00:00,2016-12-12 18:00:00,0.0
6,2016-03-19 16:15:00,2016-03-19 16:30:00,15.0
7,2016-11-16 14:30:00,2016-11-16 15:15:00,45.0
8,2016-03-22 20:00:00,2016-03-22 20:15:00,15.0
9,2016-11-24 02:00:00,2016-11-24 02:00:00,0.0


In [43]:
# python UDF
time_diff_udf = f.udf(lambda start_time, end_time: (end_time - start_time)/timedelta(minutes=1))

display(
    (
        tips
        .withColumn('trip_minutes', time_diff_udf(f.col('start_time'), f.col('end_time')))
        .select('start_time', 'end_time', 'trip_minutes')
    ),
    10,
)

Unnamed: 0,start_time,end_time,trip_minutes
0,2016-12-17 23:30:00,2016-12-17 23:30:00,0.0
1,2016-02-20 02:30:00,2016-02-20 02:30:00,0.0
2,2016-08-17 18:45:00,2016-08-17 19:00:00,15.0
3,2016-07-15 00:15:00,2016-07-15 00:30:00,15.0
4,2016-02-01 14:15:00,2016-02-01 14:30:00,15.0
5,2016-12-12 18:00:00,2016-12-12 18:00:00,0.0
6,2016-03-19 16:15:00,2016-03-19 16:30:00,15.0
7,2016-11-16 14:30:00,2016-11-16 15:15:00,45.0
8,2016-03-22 20:00:00,2016-03-22 20:15:00,15.0
9,2016-11-24 02:00:00,2016-11-24 02:00:00,0.0


### Location based feature

In [44]:
avg_miles_by_census_tract = (
    tips
    .groupby('dropoff_census_tract')
    .agg(f.avg(f.col('trip_miles')).alias('avg_miles_by_census_tract'))
)

display(avg_miles_by_census_tract, 10)

Unnamed: 0,dropoff_census_tract,avg_miles_by_census_tract
0,17031832600,3.349968
1,17031837400,2.8
2,17031062200,5.407275
3,17031843200,3.635294
4,17031806900,9.846667
5,17031020602,1.61875
6,17031130200,0.8
7,17031241300,3.583516
8,17031150501,1.4
9,17031838000,3.928857


In [45]:
display(
    (
        tips
        .join(avg_miles_by_census_tract, 'dropoff_census_tract', 'left')
        .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
    ),
    10,
)

Unnamed: 0,pickup_census_tract,dropoff_census_tract,avg_miles_by_census_tract
0,17031839100,17031832600,3.349968
1,17031081800,17031832600,3.349968
2,17031081800,17031832600,3.349968
3,17031081600,17031832600,3.349968
4,17031980000,17031832600,3.349968
5,17031081700,17031832600,3.349968
6,17031071300,17031832600,3.349968
7,17031062100,17031832600,3.349968
8,17031080300,17031832600,3.349968
9,17031320100,17031832600,3.349968


In [46]:
census_block_window = Window().partitionBy('dropoff_census_tract')

display(
    (
        tips
        .withColumn('avg_miles_by_census_tract', f.avg(f.col('trip_miles')).over(census_block_window))
        .select('pickup_census_tract', 'dropoff_census_tract', 'avg_miles_by_census_tract')
    ),
    10
)

Unnamed: 0,pickup_census_tract,dropoff_census_tract,avg_miles_by_census_tract
0,17031839100,17031832600,3.349968
1,17031081800,17031832600,3.349968
2,17031081800,17031832600,3.349968
3,17031081600,17031832600,3.349968
4,17031980000,17031832600,3.349968
5,17031081700,17031832600,3.349968
6,17031071300,17031832600,3.349968
7,17031062100,17031832600,3.349968
8,17031080300,17031832600,3.349968
9,17031320100,17031832600,3.349968


## Exercises

In [47]:
# Can you build a function that takes in a dataframe with the columns found in `taxi_2016`
# and output features discussed above?

In [48]:
def build_features(raw_df):
    avg_miles_by_census_tract = (
        raw_df
        .groupby('dropoff_census_tract')
        .agg(f.avg(f.col('trip_miles')).alias('avg_trip_miles_by_dropoff_census_tract'))
    )

    features = (
        raw_df
        .where(f.col('tips').isNotNull())
        .join(avg_miles_by_census_tract, on='dropoff_census_tract')
        .select(
            'trip_id',
            'company',
            'trip_miles',
            'fare',
            f.month('start_time').alias('start_month'),
            f.dayofweek('start_time').alias('start_day_of_week'),
            f.hour('start_time').alias('start_hour'),
            (f.unix_timestamp(f.col('end_time')) - f.unix_timestamp(f.col('start_time'))).alias('trip_minutes'),
            'avg_trip_miles_by_dropoff_census_tract',
            f.col('tips').alias('label'),
        )
    )
    
    return features

In [49]:
features = build_features(tips)

In [50]:
display(features)

Unnamed: 0,trip_id,company,trip_miles,fare,start_month,start_day_of_week,start_hour,trip_minutes,avg_trip_miles_by_dropoff_census_tract,label
0,2d495c6e8ee5cad52f64de2844f5bdd9999c29f5,,3.3,13.25,7,4,18,900,3.349968,2.06
1,2d4c21df0917fd4c630c704b4d1b501c8aec5e14,,2.1,9.5,12,6,19,900,3.349968,1.5
2,2d4fbb3ea94d4ed30fb45c9c7f278ac75b632fff,Taxi Affiliation Services,0.1,12.25,10,4,16,900,3.349968,3.05
3,2d50340947f1362cdebc258576e8c84423e25f6e,Northwest Management LLC,2.7,10.25,1,5,20,0,3.349968,2.55
4,2d51c476faa582cfa6a4ff99a1e8c08ac2396a0b,Northwest Management LLC,15.0,43.5,8,6,15,4500,3.349968,14.25


In [51]:
# What other data points might be useful to predict what tip a given trip would have?
# Can you construct a column with that information?

# Since this prompt is wide open I've not written out a direct solution, but encourage
# you to play with different options!