## Task 1
### Create a program that produces a typed parquet file
The resulting Parquet output is saved in the directory 'data/green_tripdata_2013-09'.

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a local Spark session for this notebook.
spark = SparkSession.builder \
    .master('local') \
    .appName('Onemount Test') \
    .getOrCreate()

# inferSchema=True lets Spark derive column types from the data. Without it
# every column is read as a string, so the parquet output would carry no
# real typing.
df = spark.read.csv('data/green_tripdata_2013-09.csv', header=True, inferSchema=True)

# Normalise all column names in one projection: drop spaces and lower-case.
df = df.toDF(*[column_name.replace(" ", "").lower() for column_name in df.columns])

df.printSchema()

# repartition(1) collapses the data to a single partition before writing;
# mode('overwrite') replaces any output left by a previous run.
df.repartition(1).write.mode('overwrite').parquet('data/green_tripdata_2013-09')

## Task 2
### Create a derived dataset, from the one created above
The resulting Parquet output is saved in the directory 'data/nyc_taxi_analysis'.

#### One-Hot encoding for each hour of the day
Add 24 columns corresponding to the 24 hours of the day: **0_hour, 1_hour, 2_hour...** Each column is 1 when the pickup or the dropoff falls in that hour, otherwise 0.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

df = spark.read.parquet('data/green_tripdata_2013-09')


def hour_of_day(pickup_hour, dropoff_hour, i):
    """Build an integer flag column for one hour of the day.

    Args:
        pickup_hour: Column holding the pickup hour.
        dropoff_hour: Column holding the dropoff hour.
        i: Column (e.g. ``F.lit(3)``) holding the hour being encoded.

    Returns:
        A Column that is 1 when either hour equals ``i`` and 0 otherwise.
        A null comparison is not a match, so null hours yield 0.
    """
    # Native column expressions are evaluated by Spark itself and avoid the
    # per-row Python round trip that a UDF would require.
    return F.when((pickup_hour == i) | (dropoff_hour == i), 1).otherwise(0)


# The hour extraction does not depend on the loop variable, so build it once.
pickup_hour_col = F.hour('lpep_pickup_datetime')
dropoff_hour_col = F.hour('lpep_dropoff_datetime')

for i in range(24):
    df = df.withColumn(str(i) + '_hour',
                       hour_of_day(pickup_hour_col, dropoff_hour_col, F.lit(i)))

# Sanity check: total the flags for a few sample hours.
df.createOrReplaceTempView('nyc_taxi')
df_sql = spark.sql('''
    SELECT sum(3_hour) total_3_hour,sum(18_hour) total_18_hour,sum(15_hour) total_15_hour,sum(0_hour) total_0_hour FROM nyc_taxi
''')

df_sql.show()

#### One-Hot encoding for each day of the week
Add 7 columns corresponding to the 7 days of the week: **monday, tuesday, wednesday, thursday, friday, saturday, sunday**. Each column is 1 when the pickup or the dropoff falls on that day, otherwise 0.

In [None]:
week = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']


def day_of_week(pickup_day, dropoff_day, d):
    """Build an integer flag column for one day of the week.

    Args:
        pickup_day: Column holding the lower-cased pickup day name.
        dropoff_day: Column holding the lower-cased dropoff day name.
        d: Column (e.g. ``F.lit('monday')``) holding the day being encoded.

    Returns:
        A Column that is 1 when either day equals ``d`` and 0 otherwise.
        A null comparison is not a match, so null days yield 0.
    """
    # Native column expressions are evaluated by Spark itself and avoid the
    # per-row Python round trip that a UDF would require.
    return F.when((pickup_day == d) | (dropoff_day == d), 1).otherwise(0)


# 'EEEE' formats the full day name; lower-case it to match the names in `week`.
# These expressions do not depend on the loop variable, so build them once.
pickup_day_col = F.lower(F.date_format('lpep_pickup_datetime', 'EEEE'))
dropoff_day_col = F.lower(F.date_format('lpep_dropoff_datetime', 'EEEE'))

for d in week:
    df = df.withColumn(d, day_of_week(pickup_day_col, dropoff_day_col, F.lit(d)))

# Sanity check: total the flags for two sample days.
df.createOrReplaceTempView('nyc_taxi')
df_sql = spark.sql('''
    SELECT sum(monday) total_monday, sum(sunday) total_sunday FROM nyc_taxi
''')

df_sql.show()

#### Duration in seconds of the trip
Add a column **trip_duration** of type long.

In [None]:
# Convert both timestamps to epoch seconds, then subtract to get the
# trip length in seconds.
pickup_epoch = F.unix_timestamp('lpep_pickup_datetime')
dropoff_epoch = F.unix_timestamp('lpep_dropoff_datetime')
timeDiff = dropoff_epoch - pickup_epoch

df = df.withColumn('trip_duration', timeDiff)

# Re-register the view so the new column is visible to SQL, then preview it.
df.createOrReplaceTempView('nyc_taxi')
df_sql = spark.sql('''
    SELECT lpep_pickup_datetime, lpep_dropoff_datetime, trip_duration FROM nyc_taxi
''')

df_sql.show()

#### An int encoding to indicate if the pickup or dropoff locations were at JFK airport
Add a column **relative_jfk_airport** of type int: 1 if the pickup or dropoff was at JFK airport, otherwise 0.

In [None]:
# Bounding-box formula adapted from a publicly available source (not derived here).
import math
def cal_bounding_box(longitude: float, latitude: float, radius: float = 5) -> tuple:
    """Compute a latitude/longitude bounding box around a point.

    The box extends ``radius`` kilometres from the point in each of the four
    directions, using a spherical earth of radius 6371 km.

    Args:
        longitude: Longitude of the centre point, in degrees.
        latitude: Latitude of the centre point, in degrees.
        radius: Half-width of the box in kilometres (default 5).

    Returns:
        A tuple ``(min_lon, max_lon, min_lat, max_lat)`` in degrees.
    """
    R = 6371  # earth radius in km

    # Angular half-height of the box, in degrees.
    lat_delta = math.degrees(radius / R)
    # The longitude span is divided by cos(latitude), so it is wider than the
    # latitude span away from the equator.
    lon_delta = math.degrees(radius / R / math.cos(math.radians(latitude)))

    min_lon = longitude - lon_delta
    max_lon = longitude + lon_delta
    min_lat = latitude - lat_delta
    max_lat = latitude + lat_delta

    return min_lon, max_lon, min_lat, max_lat
    

In [None]:
# Reference point used for JFK airport.
longitude_jfk_airport = -73.7787443
latitude_jfk_airport = 40.6398262

min_lon, max_lon, min_lat, max_lat = cal_bounding_box(longitude_jfk_airport, latitude_jfk_airport)


def is_jfk_airport(p_lon, p_lat, d_lon, d_lat):
    """Build an integer flag column for trips touching the JFK bounding box.

    Args:
        p_lon: Column with the pickup longitude.
        p_lat: Column with the pickup latitude.
        d_lon: Column with the dropoff longitude.
        d_lat: Column with the dropoff latitude.

    Returns:
        A Column that is 1 when the pickup point or the dropoff point lies
        inside the box (bounds inclusive) and 0 otherwise.
    """
    pickup_in_box = p_lon.between(min_lon, max_lon) & p_lat.between(min_lat, max_lat)
    dropoff_in_box = d_lon.between(min_lon, max_lon) & d_lat.between(min_lat, max_lat)
    # A null coordinate makes its comparison null, which `when` treats as not
    # matched. A Python lambda would instead raise TypeError comparing a float
    # with None and fail the whole job.
    return F.when(pickup_in_box | dropoff_in_box, 1).otherwise(0)


# Cast to double rather than float so coordinates are not rounded to
# 32-bit precision before being compared with the box bounds.
df = df.withColumn('relative_jfk_airport',
                   is_jfk_airport(df.pickup_longitude.cast('double'),
                                  df.pickup_latitude.cast('double'),
                                  df.dropoff_longitude.cast('double'),
                                  df.dropoff_latitude.cast('double')))

# Sanity check: count the trips flagged as JFK-related.
df.createOrReplaceTempView('nyc_taxi')
df_sql = spark.sql('''
    SELECT count(*) no_jfk_airport FROM nyc_taxi WHERE relative_jfk_airport = 1
''')

df_sql.show()

#### Save the new parquet file

In [None]:
# Show the final schema, including every derived column, before persisting.
df.printSchema()

# Collapse to a single partition, then write, replacing any output left by a
# previous run.
single_partition_df = df.repartition(1)
single_partition_df.write.mode('overwrite').parquet('data/nyc_taxi_analysis')